Commit 4ed3ebc7d7a764a8ea50a8a9c58d4edb83799c99

Authored by Mathias
1 parent 1f8612ebfd
Exists in master

Save results to a CSV file now

Showing 1 changed file with 131 additions and 56 deletions Inline Diff

scripts/evaluations/clustering.py
1 ''' 1 '''
2 This script allows the user to evaluate a classification system on new labels using clustering methods. 2 This script allows the user to evaluate a classification system on new labels using clustering methods.
3 The algorithms are applied on the given latent space (embedding). 3 The algorithms are applied on the given latent space (embedding).
4 ''' 4 '''
5 import argparse 5 import argparse
6 import numpy as np 6 import numpy as np
7 import pandas as pd 7 import pandas as pd
8 import os 8 import os
9 import time 9 import time
10 import pickle 10 import pickle
11 import csv
12
11 from sklearn.preprocessing import LabelEncoder 13 from sklearn.preprocessing import LabelEncoder
12 from sklearn.metrics.pairwise import pairwise_distances 14 from sklearn.metrics.pairwise import pairwise_distances
13 from sklearn.metrics import f1_score
14 from sklearn.cluster import KMeans 15 from sklearn.cluster import KMeans
15 from sklearn.manifold import TSNE 16 from sklearn.manifold import TSNE
17 from sklearn.metrics import f1_score, homogeneity_score, completeness_score, v_measure_score
16 import matplotlib.pyplot as plt 18 import matplotlib.pyplot as plt
17 19
18 from volia.data_io import read_features,read_lst 20 from volia.data_io import read_features,read_lst
21 from volia.measures import entropy_score
19 22
20 if __name__ == "__main__": 23 '''
21 # Argparse 24 TODO:
22 parser = argparse.ArgumentParser("Compute clustering on a latent space") 25 - Add an option allowing the user to choose the number of
23 parser.add_argument("features") 26 clustering to train in order to compute the average and the
24 parser.add_argument("utt2", 27 '''
25 type=str,
26 help="file with [utt] [value]")
27 parser.add_argument("--idsfrom",
28 type=str,
29 default="utt2",
30 choices=[
31 "features",
32 "utt2"
33 ],
34 help="from features or from utt2?")
35 parser.add_argument("--prefix",
36 default="",
37 type=str,
38 help="prefix of saved files")
39 parser.add_argument("--outdir",
40 default=None,
41 type=str,
42 help="Output directory")
43
44 args = parser.parse_args()
45 28
46 assert args.outdir
47 29
def train_clustering(label_encoder, feats, classes, outdir, prefix=""):
    """Train a KMeans clustering on ``feats`` and evaluate it against ``classes``.

    The number of clusters equals the number of classes known to the encoder.
    The fitted estimator is pickled, the evaluation scores are written to a log
    file, and a t-SNE visualisation comparing true labels with predicted
    cluster labels is saved as a PDF — all under ``outdir``.

    Args:
        label_encoder: fitted sklearn ``LabelEncoder`` holding the class names.
        feats: 2-D array (n_samples, n_features) of latent-space vectors.
        classes: iterable of ground-truth label strings, one per sample.
        outdir: output directory for the model, log file and figure.
        prefix: optional string inserted into the output file names
            (defaults to "" so existing callers keep the same file names).

    Returns:
        dict mapping "f1", "entropy", "homogeneity", "completeness" and
        "v-measure" to the corresponding global scores.
    """
    start = time.time()
    num_classes = len(label_encoder.classes_)

    # Encode the ground-truth class names here so the function is
    # self-contained: the original body referenced module-level ``labels``
    # and ``le`` that are not defined inside the function.
    labels = label_encoder.transform(classes)

    # Compute KMEANS clustering on data
    estimator = KMeans(
        n_clusters=num_classes,
        n_init=100,
        # BUG FIX: the original ``tol=10-6`` is integer arithmetic and
        # evaluates to 4; the intended convergence tolerance is 1e-6.
        tol=1e-6,
        algorithm="elkan"
    )
    estimator.fit(feats)
    print(f"Kmeans: processed {estimator.n_iter_} iterations - intertia={estimator.inertia_}")

    with open(os.path.join(outdir, prefix + "_kmeans.pkl"), "wb") as f:
        pickle.dump(estimator, f)

    # contains distance to each cluster for each sample
    dist_space = estimator.transform(feats)
    predictions = np.argmin(dist_space, axis=1)

    # gives each cluster a name (considering most represented character)
    dataframe = pd.DataFrame({
        "label": pd.Series(list(map(lambda x: label_encoder.classes_[x], labels))),
        "prediction": pd.Series(predictions)
    })

    def find_cluster_name_fn(c):
        # Most frequent true label among the samples assigned to cluster ``c``.
        mask = dataframe["prediction"] == c
        return dataframe[mask]["label"].value_counts(sort=False).idxmax()

    cluster_names = list(map(find_cluster_name_fn, range(num_classes)))
    predicted_labels = label_encoder.transform(
        [cluster_names[pred] for pred in predictions])

    # F-measure (one score per class, then averaged for the global score)
    fscores = f1_score(labels, predicted_labels, average=None)
    fscores_str = "\n".join(map(lambda i: "{0:25s}: {1:.4f}".format(label_encoder.classes_[i], fscores[i]), range(len(fscores))))

    # Entropy (entropy_score returns a triple; only the global value is kept)
    _, _, entropy = entropy_score(labels, predicted_labels)

    # Homogenity
    homogeneity = homogeneity_score(labels, predicted_labels)

    # Completeness
    completeness = completeness_score(labels, predicted_labels)

    # V-Measure
    v_measure = v_measure_score(labels, predicted_labels)

    # Write results
    with open(os.path.join(outdir, "_" + prefix + "eval_clustering.log"), "w") as fd:
        print(f"F1-scores for each classes:\n{fscores_str}", file=fd)
        print(f"Entropy: {entropy}", file=fd)
        print(f"Global score : {np.mean(fscores)}", file=fd)
        print(f"Homogeneity: {homogeneity}", file=fd)
        print(f"completeness: {completeness}", file=fd)
        print(f"v-measure: {v_measure}", file=fd)

    # Process t-SNE and plot
    tsne_estimator = TSNE()
    embeddings = tsne_estimator.fit_transform(feats)
    print("t-SNE: processed {0} iterations - KL_divergence={1:.4f}".format(
        tsne_estimator.n_iter_, tsne_estimator.kl_divergence_))

    fig, [axe1, axe2] = plt.subplots(1, 2, figsize=(10, 5))
    for c, name in enumerate(label_encoder.classes_):
        c_mask = np.where(labels == c)
        axe1.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)

        try:
            id_cluster = cluster_names.index(name)
        except ValueError:
            # A class may end up with no cluster named after it when another
            # class dominates every cluster.
            print("WARNING: no cluster found for {}".format(name))
            continue
        c_mask = np.where(predictions == id_cluster)
        axe2.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)

    axe1.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
    axe1.set_title("true labels")
    axe2.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
    axe2.set_title("predicted cluster label")

    plt.suptitle("Kmeans Clustering")

    loc = os.path.join(
        outdir,
        prefix + "kmeans.pdf"
    )
    plt.savefig(loc, bbox_inches="tight")
    plt.close()

    print("INFO: figure saved at {}".format(loc))

    end = time.time()
    print("program ended in {0:.2f} seconds".format(end - start))
    return {
        "f1": np.mean(fscores),
        "entropy": entropy,
        "homogeneity": homogeneity,
        "completeness": completeness,
        "v-measure": v_measure
    }
133
134 if __name__ == "__main__":
135 # Argparse
136 parser = argparse.ArgumentParser("Compute clustering on a latent space")
137 parser.add_argument("features")
138 parser.add_argument("utt2",
139 type=str,
140 help="file with [utt] [value]")
141 parser.add_argument("--idsfrom",
142 type=str,
143 default="utt2",
144 choices=[
145 "features",
146 "utt2"
147 ],
148 help="from features or from utt2?")
149 parser.add_argument("--prefix",
150 default="",
151 type=str,
152 help="prefix of saved files")
153 parser.add_argument("--outdir",
154 default=None,
155 type=str,
156 help="Output directory")
157 parser.add_argument("--nmodels",
158 type=int,
159 default=1,
160 help="specifies the number of models to train")
161 args = parser.parse_args()
162
163 assert args.outdir
164
165 start = time.time()
166
167 # Load features and utt2
168 features = read_features(args.features)
169 utt2 = read_lst(args.utt2)
170
171 # Take id list
172 if args.idsfrom == "features":
173 ids = list(features.keys())
174 elif args.idsfrom == "utt2":
175 ids = list(utt2.keys())
176 else:
177 print(f"idsfrom is not good: {args.idsfrom}")
178 exit(1)
179