Commit 4aa3a0ea73de5edd298638d217cc1ff337be95b1

Authored by Mathias
1 parent 6bc3b63707
Exists in master

Add --onlymeasures flag that allows the user to run the script without training new clustering models.

It only loads the already trained models and calculates the measures. Useful when you add new measures and do not want to train the clustering models again.
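For context, the change boils down to a load-or-train branch around the pickled KMeans estimator. The sketch below is a minimal, self-contained illustration of that pattern; it is not the committed code, and the file path, feature array and only_measures flag are placeholders:

# Minimal sketch of the load-or-train pattern introduced by --onlymeasures.
# Names and paths here are illustrative, not taken from the repository.
import os
import pickle

import numpy as np
from sklearn.cluster import KMeans


def load_or_train_kmeans(feats, n_clusters, model_path, only_measures=False):
    if only_measures and os.path.exists(model_path):
        # Reuse the estimator trained by a previous full run.
        with open(model_path, "rb") as f:
            return pickle.load(f)
    # Otherwise train a fresh model and persist it for later --onlymeasures runs.
    estimator = KMeans(n_clusters=n_clusters, n_init=100)
    estimator.fit(feats)
    with open(model_path, "wb") as f:
        pickle.dump(estimator, f)
    return estimator


if __name__ == "__main__":
    feats = np.random.rand(100, 8)  # stand-in for the loaded embeddings
    km = load_or_train_kmeans(feats, 4, "kmeans.pkl", only_measures=False)
    print(km.predict(feats)[:10])

Either way the function ends up with a fitted estimator, so the measures can be recomputed from the reloaded model without refitting, which is exactly what the flag is for.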

Showing 1 changed file with 32 additions and 22 deletions

scripts/evaluations/clustering.py
 '''
 This script allows the user to evaluate a classification system on new labels using clustering methods.
 The algorithms are applied on the given latent space (embedding).
 '''
 import argparse
 import numpy as np
 import pandas as pd
 import os
 import time
 import pickle
 import csv
 import json
 
 from sklearn.preprocessing import LabelEncoder
 from sklearn.metrics.pairwise import pairwise_distances
 from sklearn.cluster import KMeans
 from sklearn.manifold import TSNE
 from sklearn.metrics import f1_score, homogeneity_score, completeness_score, v_measure_score
 import matplotlib.pyplot as plt
 
 from volia.data_io import read_features,read_lst
 from volia.measures import entropy_score, purity_score
 
 '''
 TODO:
 - Add an option allowing the user to choose the number of
 clustering to train in order to compute the average and the
 '''
 
 
 def train_clustering(label_encoder, feats, classes, outdir):
     num_classes = len(label_encoder.classes_)
+    estimator = None
+    kmeans_filepath = os.path.join(outdir, f"{args.prefix}kmeans.pkl")
+    if args.onlymeasures:
+        print(f"Loading model: {kmeans_filepath}")
+        with open(kmeans_filepath, "rb") as f:
+            estimator = pickle.load(f)
+    else:
+        # Compute KMEANS clustering on data
+        print("Saving parameters")
+        kmeans_parameters = {
+            "n_clusters": num_classes,
+            "n_init": 100,
+            "tol": 10-6,
+            "algorithm": "elkan"
+        }
+        with open(os.path.join(outdir, f"{args.prefix}kmeans_parameters.json"), "w") as f:
+            json.dump(kmeans_parameters, f)
 
-    # Compute KMEANS clustering on data
-    kmeans_parameters = {
-        "n_clusters": num_classes,
-        "n_init": 100,
-        "tol": 10-6,
-        "algorithm": "elkan"
-    }
-    with open(os.path.join(outdir, f"{args.prefix}kmeans_parameters.json"), "w") as f:
-        json.dump(kmeans_parameters, f)
+        # Fit the model and Save parameters
+        print(f"Fit the model: {kmeans_filepath}")
+        estimator = KMeans(
+            **kmeans_parameters
+        )
+        estimator.fit(feats)
+        print(f"Kmeans: processed {estimator.n_iter_} iterations - intertia={estimator.inertia_}")
 
-    # Save parameters
+        with open(kmeans_filepath, "wb") as f:
+            pickle.dump(estimator, f)
 
-    estimator = KMeans(
-        **kmeans_parameters
-    )
-    estimator.fit(feats)
-    print(f"Kmeans: processed {estimator.n_iter_} iterations - intertia={estimator.inertia_}")
-
-    with open(os.path.join(outdir, f"{args.prefix}kmeans.pkl"), "wb") as f:
-        pickle.dump(estimator, f)
-
     # contains distance to each cluster for each sample
     dist_space = estimator.transform(feats)
     predictions = np.argmin(dist_space, axis=1)
 
     # gives each cluster a name (considering most represented character)
     dataframe = pd.DataFrame({
         "label": pd.Series(list(map(lambda x: le.classes_[x], labels))),
         "prediction": pd.Series(predictions)
     })
 
     def find_cluster_name_fn(c):
         mask = dataframe["prediction"] == c
         return dataframe[mask]["label"].value_counts(sort=False).idxmax()
 
     cluster_names = list(map(find_cluster_name_fn, range(num_classes)))
     predicted_labels = le.transform(
         [cluster_names[pred] for pred in predictions])
 
     # F-measure
     fscores = f1_score(labels, predicted_labels, average=None)
     fscores_str = "\n".join(map(lambda i: "{0:25s}: {1:.4f}".format(le.classes_[i], fscores[i]), range(len(fscores))))
 
     # Entropy
     _, _, entropy = entropy_score(labels, predicted_labels)
 
     # Homogenity
     homogeneity = homogeneity_score(labels, predicted_labels)
 
     # Completeness
     completeness = completeness_score(labels, predicted_labels)
 
     # V-Measure
     v_measure = v_measure_score(labels, predicted_labels)
 
     # Purity
     purity_scores = purity_score(labels, predicted_labels)
     purity_class_score = purity_scores["purity_class_score"]
     purity_cluster_score = purity_scores["purity_cluster_score"]
     K = purity_scores["K"]
 
     # Write results
     with open(os.path.join(outdir, args.prefix + "eval_clustering.log"), "w") as fd:
         print(f"F1-scores for each classes:\n{fscores_str}", file=fd)
         print(f"Entropy: {entropy}", file=fd)
         print(f"Global score : {np.mean(fscores)}", file=fd)
         print(f"Homogeneity: {homogeneity}", file=fd)
         print(f"completeness: {completeness}", file=fd)
         print(f"v-measure: {v_measure}", file=fd)
         print(f"purity class score: {purity_class_score}", file=fd)
         print(f"purity cluster score: {purity_cluster_score}", file=fd)
         print(f"purity overall evaluation criterion (K): {K}", file=fd)
 
     # Process t-SNE and plot
     tsne_estimator = TSNE()
     embeddings = tsne_estimator.fit_transform(feats)
     print("t-SNE: processed {0} iterations - KL_divergence={1:.4f}".format(
         tsne_estimator.n_iter_, tsne_estimator.kl_divergence_))
 
     fig, [axe1, axe2] = plt.subplots(1, 2, figsize=(10, 5))
     for c, name in enumerate(le.classes_):
         c_mask = np.where(labels == c)
         axe1.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)
 
         try:
             id_cluster = cluster_names.index(name)
         except ValueError:
             print("WARNING: no cluster found for {}".format(name))
             continue
         c_mask = np.where(predictions == id_cluster)
         axe2.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)
 
     axe1.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
     axe1.set_title("true labels")
     axe2.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
     axe2.set_title("predicted cluster label")
 
     plt.suptitle("Kmeans Clustering")
 
     loc = os.path.join(
         outdir,
         args.prefix + "kmeans.pdf"
     )
     plt.savefig(loc, bbox_inches="tight")
     plt.close()
 
     print("INFO: figure saved at {}".format(loc))
 
     end = time.time()
     print("program ended in {0:.2f} seconds".format(end-start))
     return {
         "f1": np.mean(fscores),
         "entropy": entropy,
         "homogeneity": homogeneity,
         "completeness": completeness,
         "v-measure": v_measure,
         "purity_class_score": purity_class_score,
         "purity_cluster score": purity_cluster_score,
         "K": K
     }
 
 
 if __name__ == "__main__":
     # Argparse
     parser = argparse.ArgumentParser("Compute clustering on a latent space")
     parser.add_argument("features")
     parser.add_argument("utt2",
         type=str,
         help="file with [utt] [value]")
     parser.add_argument("--idsfrom",
         type=str,
         default="utt2",
         choices=[
             "features",
             "utt2"
         ],
         help="from features or from utt2?")
     parser.add_argument("--prefix",
         default="",
         type=str,
         help="prefix of saved files")
     parser.add_argument("--outdir",
         default=None,
         type=str,
         help="Output directory")
     parser.add_argument("--nmodels",
         type=int,
         default=1,
         help="specifies the number of models to train")
+    parser.add_argument("--onlymeasures",
+        action='store_true',
+        help="Don't compute the clustering, compute only the measures")
     args = parser.parse_args()
 
     assert args.outdir
 
     start = time.time()
 
     # Load features and utt2
     features = read_features(args.features)
     utt2 = read_lst(args.utt2)
 
     # Take id list
     if args.idsfrom == "features":
         ids = list(features.keys())
     elif args.idsfrom == "utt2":
         ids = list(utt2.keys())
     else:
         print(f"idsfrom is not good: {args.idsfrom}")
         exit(1)
 
     feats = np.vstack([ features[id_] for id_ in ids ])
     classes = [ utt2[id_] for id_ in ids ]
 
     # Encode labels
     le = LabelEncoder()
     labels = le.fit_transform(classes)
 
     measures = {}
     for i in range(1, args.nmodels+1):
         subdir = os.path.join(args.outdir, str(i))
         if not os.path.exists(subdir):
             os.mkdir(subdir)
         print(f"[{i}/{args.nmodels}] => {subdir}")
         results = train_clustering(le, feats, classes, subdir)
 
         for key, value in results.items():
             if key not in measures:
                 measures[key] = []
             measures[key].append(results[key])
 
 
     # File with results
     file_results = os.path.join(args.outdir, args.prefix + "clustering_measures.txt")
 
     with open(file_results, "w") as f:
         f.write(f"[nmodels: {args.nmodels}]\n")
         for key in measures.keys():
             values = np.asarray(measures[key], dtype=float)
             mean = np.mean(values)
             std = np.std(values)
             f.write(f"[{key} => mean: {mean}, std: {std}] \n")
 
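As a usage note, the flag fits a two-step workflow: train the models once, then recompute the measures as often as needed. A hedged sketch of that workflow follows; the script arguments are taken from the argparse definition above, but the feature and utt2 file names and the output directory are placeholders, not paths from the repository:

# Hypothetical two-step workflow enabled by --onlymeasures.
import subprocess

SCRIPT = "scripts/evaluations/clustering.py"
ARGS = ["feats.txt", "utt2char.lst", "--outdir", "exp/clustering", "--nmodels", "5"]

# First run: trains the KMeans models and pickles them under exp/clustering/<i>/.
subprocess.run(["python", SCRIPT, *ARGS], check=True)

# Later run: reloads the pickled models and only recomputes the measures,
# e.g. after adding a new metric to train_clustering().
subprocess.run(["python", SCRIPT, *ARGS, "--onlymeasures"], check=True)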