Commit 3b7e63994c7b1b562f19ca10c9c7b3b472483644
1 parent 15b183a24d
Exists in master
Readjust the way files are named using the prefix
Showing 1 changed file with 4 additions and 4 deletions
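This commit makes the --prefix option apply uniformly to every file the script writes. Before the change, the pickled model was saved as a literal _kmeans.pkl (an f-string with no placeholder), the evaluation log prepended a stray "_" before the prefix, and the two clustering_measures files in the top-level output directory ignored the prefix entirely. Note that train_clustering reads args (along with le, labels and start) from module scope, which is why args.prefix is usable inside the function without being passed as a parameter.

A minimal sketch of the naming scheme after this change (the prefix value "run1_" and the directory names are hypothetical):

    import os

    outdir = "results"                  # args.outdir
    subdir = os.path.join(outdir, "1")  # per-model directory created in the main loop
    prefix = "run1_"                    # args.prefix, empty string by default

    # every artifact now follows the same <dir>/<prefix><name> pattern
    print(os.path.join(subdir, prefix + "kmeans.pkl"))               # results/1/run1_kmeans.pkl
    print(os.path.join(subdir, prefix + "eval_clustering.log"))      # results/1/run1_eval_clustering.log
    print(os.path.join(subdir, prefix + "kmeans.pdf"))               # results/1/run1_kmeans.pdf
    print(os.path.join(outdir, prefix + "clustering_measures.txt"))  # results/run1_clustering_measures.txt
    print(os.path.join(outdir, prefix + "clustering_measures.csv"))  # results/run1_clustering_measures.csv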
scripts/evaluations/clustering.py
1 | ''' | 1 | ''' |
2 | This script allows the user to evaluate a classification system on new labels using clustering methods. | 2 | This script allows the user to evaluate a classification system on new labels using clustering methods. |
3 | The algorithms are applied on the given latent space (embedding). | 3 | The algorithms are applied on the given latent space (embedding). |
4 | ''' | 4 | ''' |
5 | import argparse | 5 | import argparse |
6 | import numpy as np | 6 | import numpy as np |
7 | import pandas as pd | 7 | import pandas as pd |
8 | import os | 8 | import os |
9 | import time | 9 | import time |
10 | import pickle | 10 | import pickle |
11 | import csv | 11 | import csv |
12 | 12 | ||
13 | from sklearn.preprocessing import LabelEncoder | 13 | from sklearn.preprocessing import LabelEncoder |
14 | from sklearn.metrics.pairwise import pairwise_distances | 14 | from sklearn.metrics.pairwise import pairwise_distances |
15 | from sklearn.cluster import KMeans | 15 | from sklearn.cluster import KMeans |
16 | from sklearn.manifold import TSNE | 16 | from sklearn.manifold import TSNE |
17 | from sklearn.metrics import f1_score, homogeneity_score, completeness_score, v_measure_score | 17 | from sklearn.metrics import f1_score, homogeneity_score, completeness_score, v_measure_score |
18 | import matplotlib.pyplot as plt | 18 | import matplotlib.pyplot as plt |
19 | 19 | ||
20 | from volia.data_io import read_features, read_lst | 20 | from volia.data_io import read_features, read_lst |
21 | from volia.measures import entropy_score, purity_score | 21 | from volia.measures import entropy_score, purity_score |
22 | 22 | ||
23 | ''' | 23 | ''' |
24 | TODO: | 24 | TODO: |
25 | - Add an option allowing the user to choose the number of | 25 | - Add an option allowing the user to choose the number of |
26 | clusterings to train in order to compute the average and the standard deviation of the measures. | 26 | clusterings to train in order to compute the average and the standard deviation of the measures. |
27 | ''' | 27 | ''' |
28 | 28 | ||
29 | 29 | ||
30 | def train_clustering(label_encoder, feats, classes, outdir): | 30 | def train_clustering(label_encoder, feats, classes, outdir): |
31 | num_classes = len(label_encoder.classes_) | 31 | num_classes = len(label_encoder.classes_) |
32 | 32 | ||
33 | # Compute KMEANS clustering on data | 33 | # Compute KMEANS clustering on data |
34 | estimator = KMeans( | 34 | estimator = KMeans( |
35 | n_clusters=num_classes, | 35 | n_clusters=num_classes, |
36 | n_init=100, | 36 | n_init=100, |
37 | tol=1e-6, | 37 | tol=1e-6, |
38 | algorithm="elkan" | 38 | algorithm="elkan" |
39 | ) | 39 | ) |
40 | estimator.fit(feats) | 40 | estimator.fit(feats) |
41 | print(f"Kmeans: processed {estimator.n_iter_} iterations - intertia={estimator.inertia_}") | 41 | print(f"Kmeans: processed {estimator.n_iter_} iterations - intertia={estimator.inertia_}") |
42 | 42 | ||
43 | with open(os.path.join(outdir, f"_kmeans.pkl"), "wb") as f: | 43 | with open(os.path.join(outdir, f"{args.prefix}kmeans.pkl"), "wb") as f: |
44 | pickle.dump(estimator, f) | 44 | pickle.dump(estimator, f) |
45 | 45 | ||
46 | # contains distance to each cluster for each sample | 46 | # contains distance to each cluster for each sample |
47 | dist_space = estimator.transform(feats) | 47 | dist_space = estimator.transform(feats) |
48 | predictions = np.argmin(dist_space, axis=1) | 48 | predictions = np.argmin(dist_space, axis=1) |
49 | 49 | ||
50 | # gives each cluster a name (considering most represented character) | 50 | # gives each cluster a name (considering most represented character) |
51 | dataframe = pd.DataFrame({ | 51 | dataframe = pd.DataFrame({ |
52 | "label": pd.Series(list(map(lambda x: le.classes_[x], labels))), | 52 | "label": pd.Series(list(map(lambda x: le.classes_[x], labels))), |
53 | "prediction": pd.Series(predictions) | 53 | "prediction": pd.Series(predictions) |
54 | }) | 54 | }) |
55 | 55 | ||
56 | def find_cluster_name_fn(c): | 56 | def find_cluster_name_fn(c): |
57 | mask = dataframe["prediction"] == c | 57 | mask = dataframe["prediction"] == c |
58 | return dataframe[mask]["label"].value_counts(sort=False).idxmax() | 58 | return dataframe[mask]["label"].value_counts(sort=False).idxmax() |
59 | 59 | ||
60 | cluster_names = list(map(find_cluster_name_fn, range(num_classes))) | 60 | cluster_names = list(map(find_cluster_name_fn, range(num_classes))) |
61 | predicted_labels = le.transform( | 61 | predicted_labels = le.transform( |
62 | [cluster_names[pred] for pred in predictions]) | 62 | [cluster_names[pred] for pred in predictions]) |
63 | 63 | ||
64 | # F-measure | 64 | # F-measure |
65 | fscores = f1_score(labels, predicted_labels, average=None) | 65 | fscores = f1_score(labels, predicted_labels, average=None) |
66 | fscores_str = "\n".join(map(lambda i: "{0:25s}: {1:.4f}".format(le.classes_[i], fscores[i]), range(len(fscores)))) | 66 | fscores_str = "\n".join(map(lambda i: "{0:25s}: {1:.4f}".format(le.classes_[i], fscores[i]), range(len(fscores)))) |
67 | 67 | ||
68 | # Entropy | 68 | # Entropy |
69 | _, _, entropy = entropy_score(labels, predicted_labels) | 69 | _, _, entropy = entropy_score(labels, predicted_labels) |
70 | 70 | ||
71 | # Homogeneity | 71 | # Homogeneity |
72 | homogeneity = homogeneity_score(labels, predicted_labels) | 72 | homogeneity = homogeneity_score(labels, predicted_labels) |
73 | 73 | ||
74 | # Completeness | 74 | # Completeness |
75 | completeness = completeness_score(labels, predicted_labels) | 75 | completeness = completeness_score(labels, predicted_labels) |
76 | 76 | ||
77 | # V-Measure | 77 | # V-Measure |
78 | v_measure = v_measure_score(labels, predicted_labels) | 78 | v_measure = v_measure_score(labels, predicted_labels) |
79 | 79 | ||
80 | # Purity | 80 | # Purity |
81 | purity_scores = purity_score(labels, predicted_labels) | 81 | purity_scores = purity_score(labels, predicted_labels) |
82 | purity_class_score = purity_scores["purity_class_score"] | 82 | purity_class_score = purity_scores["purity_class_score"] |
83 | purity_cluster_score = purity_scores["purity_cluster_score"] | 83 | purity_cluster_score = purity_scores["purity_cluster_score"] |
84 | K = purity_scores["K"] | 84 | K = purity_scores["K"] |
85 | 85 | ||
86 | # Write results | 86 | # Write results |
87 | with open(os.path.join(outdir, f"_" + args.prefix + "eval_clustering.log"), "w") as fd: | 87 | with open(os.path.join(outdir, args.prefix + "eval_clustering.log"), "w") as fd: |
88 | print(f"F1-scores for each classes:\n{fscores_str}", file=fd) | 88 | print(f"F1-scores for each classes:\n{fscores_str}", file=fd) |
89 | print(f"Entropy: {entropy}", file=fd) | 89 | print(f"Entropy: {entropy}", file=fd) |
90 | print(f"Global score : {np.mean(fscores)}", file=fd) | 90 | print(f"Global score : {np.mean(fscores)}", file=fd) |
91 | print(f"Homogeneity: {homogeneity}", file=fd) | 91 | print(f"Homogeneity: {homogeneity}", file=fd) |
92 | print(f"completeness: {completeness}", file=fd) | 92 | print(f"completeness: {completeness}", file=fd) |
93 | print(f"v-measure: {v_measure}", file=fd) | 93 | print(f"v-measure: {v_measure}", file=fd) |
94 | print(f"purity class score: {purity_class_score}", file=fd) | 94 | print(f"purity class score: {purity_class_score}", file=fd) |
95 | print(f"purity cluster score: {purity_cluster_score}", file=fd) | 95 | print(f"purity cluster score: {purity_cluster_score}", file=fd) |
96 | print(f"purity overall evaluation criterion (K): {K}", file=fd) | 96 | print(f"purity overall evaluation criterion (K): {K}", file=fd) |
97 | 97 | ||
98 | # Process t-SNE and plot | 98 | # Process t-SNE and plot |
99 | tsne_estimator = TSNE() | 99 | tsne_estimator = TSNE() |
100 | embeddings = tsne_estimator.fit_transform(feats) | 100 | embeddings = tsne_estimator.fit_transform(feats) |
101 | print("t-SNE: processed {0} iterations - KL_divergence={1:.4f}".format( | 101 | print("t-SNE: processed {0} iterations - KL_divergence={1:.4f}".format( |
102 | tsne_estimator.n_iter_, tsne_estimator.kl_divergence_)) | 102 | tsne_estimator.n_iter_, tsne_estimator.kl_divergence_)) |
103 | 103 | ||
104 | fig, [axe1, axe2] = plt.subplots(1, 2, figsize=(10, 5)) | 104 | fig, [axe1, axe2] = plt.subplots(1, 2, figsize=(10, 5)) |
105 | for c, name in enumerate(le.classes_): | 105 | for c, name in enumerate(le.classes_): |
106 | c_mask = np.where(labels == c) | 106 | c_mask = np.where(labels == c) |
107 | axe1.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None) | 107 | axe1.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None) |
108 | 108 | ||
109 | try: | 109 | try: |
110 | id_cluster = cluster_names.index(name) | 110 | id_cluster = cluster_names.index(name) |
111 | except ValueError: | 111 | except ValueError: |
112 | print("WARNING: no cluster found for {}".format(name)) | 112 | print("WARNING: no cluster found for {}".format(name)) |
113 | continue | 113 | continue |
114 | c_mask = np.where(predictions == id_cluster) | 114 | c_mask = np.where(predictions == id_cluster) |
115 | axe2.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None) | 115 | axe2.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None) |
116 | 116 | ||
117 | axe1.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35)) | 117 | axe1.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35)) |
118 | axe1.set_title("true labels") | 118 | axe1.set_title("true labels") |
119 | axe2.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35)) | 119 | axe2.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35)) |
120 | axe2.set_title("predicted cluster label") | 120 | axe2.set_title("predicted cluster label") |
121 | 121 | ||
122 | plt.suptitle("Kmeans Clustering") | 122 | plt.suptitle("Kmeans Clustering") |
123 | 123 | ||
124 | loc = os.path.join( | 124 | loc = os.path.join( |
125 | outdir, | 125 | outdir, |
126 | args.prefix + "kmeans.pdf" | 126 | args.prefix + "kmeans.pdf" |
127 | ) | 127 | ) |
128 | plt.savefig(loc, bbox_inches="tight") | 128 | plt.savefig(loc, bbox_inches="tight") |
129 | plt.close() | 129 | plt.close() |
130 | 130 | ||
131 | print("INFO: figure saved at {}".format(loc)) | 131 | print("INFO: figure saved at {}".format(loc)) |
132 | 132 | ||
133 | end = time.time() | 133 | end = time.time() |
134 | print("program ended in {0:.2f} seconds".format(end-start)) | 134 | print("program ended in {0:.2f} seconds".format(end-start)) |
135 | return { | 135 | return { |
136 | "f1": np.mean(fscores), | 136 | "f1": np.mean(fscores), |
137 | "entropy": entropy, | 137 | "entropy": entropy, |
138 | "homogeneity": homogeneity, | 138 | "homogeneity": homogeneity, |
139 | "completeness": completeness, | 139 | "completeness": completeness, |
140 | "v-measure": v_measure, | 140 | "v-measure": v_measure, |
141 | "purity_class_score": purity_class_score, | 141 | "purity_class_score": purity_class_score, |
142 | "purity_cluster score": purity_cluster_score, | 142 | "purity_cluster score": purity_cluster_score, |
143 | "K": K | 143 | "K": K |
144 | } | 144 | } |
145 | 145 | ||
146 | 146 | ||
147 | if __name__ == "__main__": | 147 | if __name__ == "__main__": |
148 | # Argparse | 148 | # Argparse |
149 | parser = argparse.ArgumentParser(description="Compute clustering on a latent space") | 149 | parser = argparse.ArgumentParser(description="Compute clustering on a latent space") |
150 | parser.add_argument("features") | 150 | parser.add_argument("features") |
151 | parser.add_argument("utt2", | 151 | parser.add_argument("utt2", |
152 | type=str, | 152 | type=str, |
153 | help="file with [utt] [value]") | 153 | help="file with [utt] [value]") |
154 | parser.add_argument("--idsfrom", | 154 | parser.add_argument("--idsfrom", |
155 | type=str, | 155 | type=str, |
156 | default="utt2", | 156 | default="utt2", |
157 | choices=[ | 157 | choices=[ |
158 | "features", | 158 | "features", |
159 | "utt2" | 159 | "utt2" |
160 | ], | 160 | ], |
161 | help="from features or from utt2?") | 161 | help="from features or from utt2?") |
162 | parser.add_argument("--prefix", | 162 | parser.add_argument("--prefix", |
163 | default="", | 163 | default="", |
164 | type=str, | 164 | type=str, |
165 | help="prefix of saved files") | 165 | help="prefix of saved files") |
166 | parser.add_argument("--outdir", | 166 | parser.add_argument("--outdir", |
167 | default=None, | 167 | default=None, |
168 | type=str, | 168 | type=str, |
169 | help="Output directory") | 169 | help="Output directory") |
170 | parser.add_argument("--nmodels", | 170 | parser.add_argument("--nmodels", |
171 | type=int, | 171 | type=int, |
172 | default=1, | 172 | default=1, |
173 | help="specifies the number of models to train") | 173 | help="specifies the number of models to train") |
174 | args = parser.parse_args() | 174 | args = parser.parse_args() |
175 | 175 | ||
176 | assert args.outdir, "--outdir is required" | 176 | assert args.outdir, "--outdir is required" |
177 | 177 | ||
178 | start = time.time() | 178 | start = time.time() |
179 | 179 | ||
180 | # Load features and utt2 | 180 | # Load features and utt2 |
181 | features = read_features(args.features) | 181 | features = read_features(args.features) |
182 | utt2 = read_lst(args.utt2) | 182 | utt2 = read_lst(args.utt2) |
183 | 183 | ||
184 | # Take id list | 184 | # Take id list |
185 | if args.idsfrom == "features": | 185 | if args.idsfrom == "features": |
186 | ids = list(features.keys()) | 186 | ids = list(features.keys()) |
187 | elif args.idsfrom == "utt2": | 187 | elif args.idsfrom == "utt2": |
188 | ids = list(utt2.keys()) | 188 | ids = list(utt2.keys()) |
189 | else: | 189 | else: |
190 | print(f"idsfrom is not good: {args.idsfrom}") | 190 | print(f"idsfrom is not good: {args.idsfrom}") |
191 | exit(1) | 191 | exit(1) |
192 | 192 | ||
193 | feats = np.vstack([ features[id_] for id_ in ids ]) | 193 | feats = np.vstack([ features[id_] for id_ in ids ]) |
194 | classes = [ utt2[id_] for id_ in ids ] | 194 | classes = [ utt2[id_] for id_ in ids ] |
195 | 195 | ||
196 | # Encode labels | 196 | # Encode labels |
197 | le = LabelEncoder() | 197 | le = LabelEncoder() |
198 | labels = le.fit_transform(classes) | 198 | labels = le.fit_transform(classes) |
199 | 199 | ||
200 | measures = {} | 200 | measures = {} |
201 | for i in range(1, args.nmodels+1): | 201 | for i in range(1, args.nmodels+1): |
202 | subdir = os.path.join(args.outdir, str(i)) | 202 | subdir = os.path.join(args.outdir, str(i)) |
203 | if not os.path.exists(subdir): | 203 | if not os.path.exists(subdir): |
204 | os.mkdir(subdir) | 204 | os.mkdir(subdir) |
205 | print(f"[{i}/{args.nmodels}] => {subdir}") | 205 | print(f"[{i}/{args.nmodels}] => {subdir}") |
206 | results = train_clustering(le, feats, classes, subdir) | 206 | results = train_clustering(le, feats, classes, subdir) |
207 | 207 | ||
208 | for key, value in results.items(): | 208 | for key, value in results.items(): |
209 | if key not in measures: | 209 | if key not in measures: |
210 | measures[key] = [] | 210 | measures[key] = [] |
211 | measures[key].append(value) | 211 | measures[key].append(value) |
212 | 212 | ||
213 | 213 | ||
214 | # File with results | 214 | # File with results |
215 | file_results = os.path.join(args.outdir, "clustering_measures.txt") | 215 | file_results = os.path.join(args.outdir, args.prefix + "clustering_measures.txt") |
216 | 216 | ||
217 | with open(file_results, "w") as f: | 217 | with open(file_results, "w") as f: |
218 | f.write(f"[nmodels: {args.nmodels}]\n") | 218 | f.write(f"[nmodels: {args.nmodels}]\n") |
219 | for key in measures.keys(): | 219 | for key in measures.keys(): |
220 | values = np.asarray(measures[key], dtype=float) | 220 | values = np.asarray(measures[key], dtype=float) |
221 | mean = np.mean(values) | 221 | mean = np.mean(values) |
222 | std = np.std(values) | 222 | std = np.std(values) |
223 | f.write(f"[{key} => mean: {mean}, std: {std}] \n") | 223 | f.write(f"[{key} => mean: {mean}, std: {std}] \n") |
224 | 224 | ||
225 | # CSV File with all the values | 225 | # CSV File with all the values |
226 | file_csv_measures = os.path.join(args.outdir, "clustering_measures.csv") | 226 | file_csv_measures = os.path.join(args.outdir, args.prefix + "clustering_measures.csv") |
227 | 227 | ||
228 | with open(file_csv_measures, "w", newline="") as f: | 228 | with open(file_csv_measures, "w", newline="") as f: |
229 | writer = csv.writer(f, delimiter=",") | 229 | writer = csv.writer(f, delimiter=",") |
230 | writer.writerow(["measure"] + list(range(1, args.nmodels+1)) + ["mean"] + ["std"]) | 230 | writer.writerow(["measure"] + list(range(1, args.nmodels+1)) + ["mean"] + ["std"]) |
231 | for key in measures.keys(): | 231 | for key in measures.keys(): |
232 | values = np.asarray(measures[key], dtype=float) | 232 | values = np.asarray(measures[key], dtype=float) |
233 | mean = np.mean(values) | 233 | mean = np.mean(values) |
234 | std = np.std(values) | 234 | std = np.std(values) |
235 | writer.writerow([key] + list(values) + [mean] + [std]) | 235 | writer.writerow([key] + list(values) + [mean] + [std]) |
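For context, a hypothetical invocation (the input paths are made up; the features and utt2 files must be in whatever formats volia.data_io.read_features and read_lst expect):

    python scripts/evaluations/clustering.py \
        data/xvectors.txt data/utt2char \
        --outdir results --prefix run1_ --nmodels 5

With --nmodels 5 the script trains five KMeans models in results/1 through results/5, each writing its own run1_kmeans.pkl, run1_eval_clustering.log and run1_kmeans.pdf, then aggregates the per-run measures into results/run1_clustering_measures.txt (mean and std per measure) and results/run1_clustering_measures.csv (one row per measure with all values plus mean and std).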