Commit 4aa3a0ea73de5edd298638d217cc1ff337be95b1
1 parent
6bc3b63707
Exists in
master
Add --onlymeasures flag that allows the user to run the script without training n…
…ew clustering models. It only loads the already trained models and calculates the measures. Useful when you add new measures and you don't want to train clustering models again.
Showing 1 changed file with 32 additions and 22 deletions Inline Diff
scripts/evaluations/clustering.py
1 | ''' | 1 | ''' |
2 | This script allows the user to evaluate a classification system on new labels using clustering methods. | 2 | This script allows the user to evaluate a classification system on new labels using clustering methods. |
3 | The algorithms are applied on the given latent space (embedding). | 3 | The algorithms are applied on the given latent space (embedding). |
4 | ''' | 4 | ''' |
5 | import argparse | 5 | import argparse |
6 | import numpy as np | 6 | import numpy as np |
7 | import pandas as pd | 7 | import pandas as pd |
8 | import os | 8 | import os |
9 | import time | 9 | import time |
10 | import pickle | 10 | import pickle |
11 | import csv | 11 | import csv |
12 | import json | 12 | import json |
13 | 13 | ||
14 | from sklearn.preprocessing import LabelEncoder | 14 | from sklearn.preprocessing import LabelEncoder |
15 | from sklearn.metrics.pairwise import pairwise_distances | 15 | from sklearn.metrics.pairwise import pairwise_distances |
16 | from sklearn.cluster import KMeans | 16 | from sklearn.cluster import KMeans |
17 | from sklearn.manifold import TSNE | 17 | from sklearn.manifold import TSNE |
18 | from sklearn.metrics import f1_score, homogeneity_score, completeness_score, v_measure_score | 18 | from sklearn.metrics import f1_score, homogeneity_score, completeness_score, v_measure_score |
19 | import matplotlib.pyplot as plt | 19 | import matplotlib.pyplot as plt |
20 | 20 | ||
21 | from volia.data_io import read_features,read_lst | 21 | from volia.data_io import read_features,read_lst |
22 | from volia.measures import entropy_score, purity_score | 22 | from volia.measures import entropy_score, purity_score |
23 | 23 | ||
24 | ''' | 24 | ''' |
25 | TODO: | 25 | TODO: |
26 | - Add an option allowing the user to choose the number of | 26 | - Add an option allowing the user to choose the number of |
27 | clustering to train in order to compute the average and the | 27 | clustering to train in order to compute the average and the |
28 | ''' | 28 | ''' |
29 | 29 | ||
30 | 30 | ||
31 | def train_clustering(label_encoder, feats, classes, outdir): | 31 | def train_clustering(label_encoder, feats, classes, outdir): |
32 | num_classes = len(label_encoder.classes_) | 32 | num_classes = len(label_encoder.classes_) |
33 | estimator = None | ||
34 | kmeans_filepath = os.path.join(outdir, f"{args.prefix}kmeans.pkl") | ||
35 | if args.onlymeasures: | ||
36 | print(f"Loading model: {kmeans_filepath}") | ||
37 | with open(kmeans_filepath, "rb") as f: | ||
38 | estimator = pickle.load(f) | ||
39 | else: | ||
40 | # Compute KMEANS clustering on data | ||
41 | print("Saving parameters") | ||
42 | kmeans_parameters = { | ||
43 | "n_clusters": num_classes, | ||
44 | "n_init": 100, | ||
45 | "tol": 10-6, | ||
46 | "algorithm": "elkan" | ||
47 | } | ||
48 | with open(os.path.join(outdir, f"{args.prefix}kmeans_parameters.json"), "w") as f: | ||
49 | json.dump(kmeans_parameters, f) | ||
33 | 50 | ||
34 | # Compute KMEANS clustering on data | 51 | # Fit the model and Save parameters |
35 | kmeans_parameters = { | 52 | print(f"Fit the model: {kmeans_filepath}") |
36 | "n_clusters": num_classes, | 53 | estimator = KMeans( |
37 | "n_init": 100, | 54 | **kmeans_parameters |
38 | "tol": 10-6, | 55 | ) |
39 | "algorithm": "elkan" | 56 | estimator.fit(feats) |
40 | } | 57 | print(f"Kmeans: processed {estimator.n_iter_} iterations - intertia={estimator.inertia_}") |
41 | with open(os.path.join(outdir, f"{args.prefix}kmeans_parameters.json"), "w") as f: | ||
42 | json.dump(kmeans_parameters, f) | ||
43 | 58 | ||
44 | # Save parameters | 59 | with open(kmeans_filepath, "wb") as f: |
60 | pickle.dump(estimator, f) | ||
45 | 61 | ||
46 | estimator = KMeans( | ||
47 | **kmeans_parameters | ||
48 | ) | ||
49 | estimator.fit(feats) | ||
50 | print(f"Kmeans: processed {estimator.n_iter_} iterations - intertia={estimator.inertia_}") | ||
51 | |||
52 | with open(os.path.join(outdir, f"{args.prefix}kmeans.pkl"), "wb") as f: | ||
53 | pickle.dump(estimator, f) | ||
54 | |||
55 | # contains distance to each cluster for each sample | 62 | # contains distance to each cluster for each sample |
56 | dist_space = estimator.transform(feats) | 63 | dist_space = estimator.transform(feats) |
57 | predictions = np.argmin(dist_space, axis=1) | 64 | predictions = np.argmin(dist_space, axis=1) |
58 | 65 | ||
59 | # gives each cluster a name (considering most represented character) | 66 | # gives each cluster a name (considering most represented character) |
60 | dataframe = pd.DataFrame({ | 67 | dataframe = pd.DataFrame({ |
61 | "label": pd.Series(list(map(lambda x: le.classes_[x], labels))), | 68 | "label": pd.Series(list(map(lambda x: le.classes_[x], labels))), |
62 | "prediction": pd.Series(predictions) | 69 | "prediction": pd.Series(predictions) |
63 | }) | 70 | }) |
64 | 71 | ||
65 | def find_cluster_name_fn(c): | 72 | def find_cluster_name_fn(c): |
66 | mask = dataframe["prediction"] == c | 73 | mask = dataframe["prediction"] == c |
67 | return dataframe[mask]["label"].value_counts(sort=False).idxmax() | 74 | return dataframe[mask]["label"].value_counts(sort=False).idxmax() |
68 | 75 | ||
69 | cluster_names = list(map(find_cluster_name_fn, range(num_classes))) | 76 | cluster_names = list(map(find_cluster_name_fn, range(num_classes))) |
70 | predicted_labels = le.transform( | 77 | predicted_labels = le.transform( |
71 | [cluster_names[pred] for pred in predictions]) | 78 | [cluster_names[pred] for pred in predictions]) |
72 | 79 | ||
73 | # F-measure | 80 | # F-measure |
74 | fscores = f1_score(labels, predicted_labels, average=None) | 81 | fscores = f1_score(labels, predicted_labels, average=None) |
75 | fscores_str = "\n".join(map(lambda i: "{0:25s}: {1:.4f}".format(le.classes_[i], fscores[i]), range(len(fscores)))) | 82 | fscores_str = "\n".join(map(lambda i: "{0:25s}: {1:.4f}".format(le.classes_[i], fscores[i]), range(len(fscores)))) |
76 | 83 | ||
77 | # Entropy | 84 | # Entropy |
78 | _, _, entropy = entropy_score(labels, predicted_labels) | 85 | _, _, entropy = entropy_score(labels, predicted_labels) |
79 | 86 | ||
80 | # Homogenity | 87 | # Homogenity |
81 | homogeneity = homogeneity_score(labels, predicted_labels) | 88 | homogeneity = homogeneity_score(labels, predicted_labels) |
82 | 89 | ||
83 | # Completeness | 90 | # Completeness |
84 | completeness = completeness_score(labels, predicted_labels) | 91 | completeness = completeness_score(labels, predicted_labels) |
85 | 92 | ||
86 | # V-Measure | 93 | # V-Measure |
87 | v_measure = v_measure_score(labels, predicted_labels) | 94 | v_measure = v_measure_score(labels, predicted_labels) |
88 | 95 | ||
89 | # Purity | 96 | # Purity |
90 | purity_scores = purity_score(labels, predicted_labels) | 97 | purity_scores = purity_score(labels, predicted_labels) |
91 | purity_class_score = purity_scores["purity_class_score"] | 98 | purity_class_score = purity_scores["purity_class_score"] |
92 | purity_cluster_score = purity_scores["purity_cluster_score"] | 99 | purity_cluster_score = purity_scores["purity_cluster_score"] |
93 | K = purity_scores["K"] | 100 | K = purity_scores["K"] |
94 | 101 | ||
95 | # Write results | 102 | # Write results |
96 | with open(os.path.join(outdir, args.prefix + "eval_clustering.log"), "w") as fd: | 103 | with open(os.path.join(outdir, args.prefix + "eval_clustering.log"), "w") as fd: |
97 | print(f"F1-scores for each classes:\n{fscores_str}", file=fd) | 104 | print(f"F1-scores for each classes:\n{fscores_str}", file=fd) |
98 | print(f"Entropy: {entropy}", file=fd) | 105 | print(f"Entropy: {entropy}", file=fd) |
99 | print(f"Global score : {np.mean(fscores)}", file=fd) | 106 | print(f"Global score : {np.mean(fscores)}", file=fd) |
100 | print(f"Homogeneity: {homogeneity}", file=fd) | 107 | print(f"Homogeneity: {homogeneity}", file=fd) |
101 | print(f"completeness: {completeness}", file=fd) | 108 | print(f"completeness: {completeness}", file=fd) |
102 | print(f"v-measure: {v_measure}", file=fd) | 109 | print(f"v-measure: {v_measure}", file=fd) |
103 | print(f"purity class score: {purity_class_score}", file=fd) | 110 | print(f"purity class score: {purity_class_score}", file=fd) |
104 | print(f"purity cluster score: {purity_cluster_score}", file=fd) | 111 | print(f"purity cluster score: {purity_cluster_score}", file=fd) |
105 | print(f"purity overall evaluation criterion (K): {K}", file=fd) | 112 | print(f"purity overall evaluation criterion (K): {K}", file=fd) |
106 | 113 | ||
107 | # Process t-SNE and plot | 114 | # Process t-SNE and plot |
108 | tsne_estimator = TSNE() | 115 | tsne_estimator = TSNE() |
109 | embeddings = tsne_estimator.fit_transform(feats) | 116 | embeddings = tsne_estimator.fit_transform(feats) |
110 | print("t-SNE: processed {0} iterations - KL_divergence={1:.4f}".format( | 117 | print("t-SNE: processed {0} iterations - KL_divergence={1:.4f}".format( |
111 | tsne_estimator.n_iter_, tsne_estimator.kl_divergence_)) | 118 | tsne_estimator.n_iter_, tsne_estimator.kl_divergence_)) |
112 | 119 | ||
113 | fig, [axe1, axe2] = plt.subplots(1, 2, figsize=(10, 5)) | 120 | fig, [axe1, axe2] = plt.subplots(1, 2, figsize=(10, 5)) |
114 | for c, name in enumerate(le.classes_): | 121 | for c, name in enumerate(le.classes_): |
115 | c_mask = np.where(labels == c) | 122 | c_mask = np.where(labels == c) |
116 | axe1.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None) | 123 | axe1.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None) |
117 | 124 | ||
118 | try: | 125 | try: |
119 | id_cluster = cluster_names.index(name) | 126 | id_cluster = cluster_names.index(name) |
120 | except ValueError: | 127 | except ValueError: |
121 | print("WARNING: no cluster found for {}".format(name)) | 128 | print("WARNING: no cluster found for {}".format(name)) |
122 | continue | 129 | continue |
123 | c_mask = np.where(predictions == id_cluster) | 130 | c_mask = np.where(predictions == id_cluster) |
124 | axe2.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None) | 131 | axe2.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None) |
125 | 132 | ||
126 | axe1.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35)) | 133 | axe1.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35)) |
127 | axe1.set_title("true labels") | 134 | axe1.set_title("true labels") |
128 | axe2.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35)) | 135 | axe2.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35)) |
129 | axe2.set_title("predicted cluster label") | 136 | axe2.set_title("predicted cluster label") |
130 | 137 | ||
131 | plt.suptitle("Kmeans Clustering") | 138 | plt.suptitle("Kmeans Clustering") |
132 | 139 | ||
133 | loc = os.path.join( | 140 | loc = os.path.join( |
134 | outdir, | 141 | outdir, |
135 | args.prefix + "kmeans.pdf" | 142 | args.prefix + "kmeans.pdf" |
136 | ) | 143 | ) |
137 | plt.savefig(loc, bbox_inches="tight") | 144 | plt.savefig(loc, bbox_inches="tight") |
138 | plt.close() | 145 | plt.close() |
139 | 146 | ||
140 | print("INFO: figure saved at {}".format(loc)) | 147 | print("INFO: figure saved at {}".format(loc)) |
141 | 148 | ||
142 | end = time.time() | 149 | end = time.time() |
143 | print("program ended in {0:.2f} seconds".format(end-start)) | 150 | print("program ended in {0:.2f} seconds".format(end-start)) |
144 | return { | 151 | return { |
145 | "f1": np.mean(fscores), | 152 | "f1": np.mean(fscores), |
146 | "entropy": entropy, | 153 | "entropy": entropy, |
147 | "homogeneity": homogeneity, | 154 | "homogeneity": homogeneity, |
148 | "completeness": completeness, | 155 | "completeness": completeness, |
149 | "v-measure": v_measure, | 156 | "v-measure": v_measure, |
150 | "purity_class_score": purity_class_score, | 157 | "purity_class_score": purity_class_score, |
151 | "purity_cluster score": purity_cluster_score, | 158 | "purity_cluster score": purity_cluster_score, |
152 | "K": K | 159 | "K": K |
153 | } | 160 | } |
154 | 161 | ||
155 | 162 | ||
156 | if __name__ == "__main__": | 163 | if __name__ == "__main__": |
157 | # Argparse | 164 | # Argparse |
158 | parser = argparse.ArgumentParser("Compute clustering on a latent space") | 165 | parser = argparse.ArgumentParser("Compute clustering on a latent space") |
159 | parser.add_argument("features") | 166 | parser.add_argument("features") |
160 | parser.add_argument("utt2", | 167 | parser.add_argument("utt2", |
161 | type=str, | 168 | type=str, |
162 | help="file with [utt] [value]") | 169 | help="file with [utt] [value]") |
163 | parser.add_argument("--idsfrom", | 170 | parser.add_argument("--idsfrom", |
164 | type=str, | 171 | type=str, |
165 | default="utt2", | 172 | default="utt2", |
166 | choices=[ | 173 | choices=[ |
167 | "features", | 174 | "features", |
168 | "utt2" | 175 | "utt2" |
169 | ], | 176 | ], |
170 | help="from features or from utt2?") | 177 | help="from features or from utt2?") |
171 | parser.add_argument("--prefix", | 178 | parser.add_argument("--prefix", |
172 | default="", | 179 | default="", |
173 | type=str, | 180 | type=str, |
174 | help="prefix of saved files") | 181 | help="prefix of saved files") |
175 | parser.add_argument("--outdir", | 182 | parser.add_argument("--outdir", |
176 | default=None, | 183 | default=None, |
177 | type=str, | 184 | type=str, |
178 | help="Output directory") | 185 | help="Output directory") |
179 | parser.add_argument("--nmodels", | 186 | parser.add_argument("--nmodels", |
180 | type=int, | 187 | type=int, |
181 | default=1, | 188 | default=1, |
182 | help="specifies the number of models to train") | 189 | help="specifies the number of models to train") |
190 | parser.add_argument("--onlymeasures", | ||
191 | action='store_true', | ||
192 | help="Don't compute the clustering, compute only the measures") | ||
183 | args = parser.parse_args() | 193 | args = parser.parse_args() |
184 | 194 | ||
185 | assert args.outdir | 195 | assert args.outdir |
186 | 196 | ||
187 | start = time.time() | 197 | start = time.time() |
188 | 198 | ||
189 | # Load features and utt2 | 199 | # Load features and utt2 |
190 | features = read_features(args.features) | 200 | features = read_features(args.features) |
191 | utt2 = read_lst(args.utt2) | 201 | utt2 = read_lst(args.utt2) |
192 | 202 | ||
193 | # Take id list | 203 | # Take id list |
194 | if args.idsfrom == "features": | 204 | if args.idsfrom == "features": |
195 | ids = list(features.keys()) | 205 | ids = list(features.keys()) |
196 | elif args.idsfrom == "utt2": | 206 | elif args.idsfrom == "utt2": |
197 | ids = list(utt2.keys()) | 207 | ids = list(utt2.keys()) |
198 | else: | 208 | else: |
199 | print(f"idsfrom is not good: {args.idsfrom}") | 209 | print(f"idsfrom is not good: {args.idsfrom}") |
200 | exit(1) | 210 | exit(1) |
201 | 211 | ||
202 | feats = np.vstack([ features[id_] for id_ in ids ]) | 212 | feats = np.vstack([ features[id_] for id_ in ids ]) |
203 | classes = [ utt2[id_] for id_ in ids ] | 213 | classes = [ utt2[id_] for id_ in ids ] |
204 | 214 | ||
205 | # Encode labels | 215 | # Encode labels |
206 | le = LabelEncoder() | 216 | le = LabelEncoder() |
207 | labels = le.fit_transform(classes) | 217 | labels = le.fit_transform(classes) |
208 | 218 | ||
209 | measures = {} | 219 | measures = {} |
210 | for i in range(1, args.nmodels+1): | 220 | for i in range(1, args.nmodels+1): |
211 | subdir = os.path.join(args.outdir, str(i)) | 221 | subdir = os.path.join(args.outdir, str(i)) |
212 | if not os.path.exists(subdir): | 222 | if not os.path.exists(subdir): |
213 | os.mkdir(subdir) | 223 | os.mkdir(subdir) |
214 | print(f"[{i}/{args.nmodels}] => {subdir}") | 224 | print(f"[{i}/{args.nmodels}] => {subdir}") |
215 | results = train_clustering(le, feats, classes, subdir) | 225 | results = train_clustering(le, feats, classes, subdir) |
216 | 226 | ||
217 | for key, value in results.items(): | 227 | for key, value in results.items(): |
218 | if key not in measures: | 228 | if key not in measures: |
219 | measures[key] = [] | 229 | measures[key] = [] |
220 | measures[key].append(results[key]) | 230 | measures[key].append(results[key]) |
221 | 231 | ||
222 | 232 | ||
223 | # File with results | 233 | # File with results |
224 | file_results = os.path.join(args.outdir, args.prefix + "clustering_measures.txt") | 234 | file_results = os.path.join(args.outdir, args.prefix + "clustering_measures.txt") |
225 | 235 | ||
226 | with open(file_results, "w") as f: | 236 | with open(file_results, "w") as f: |
227 | f.write(f"[nmodels: {args.nmodels}]\n") | 237 | f.write(f"[nmodels: {args.nmodels}]\n") |
228 | for key in measures.keys(): | 238 | for key in measures.keys(): |
229 | values = np.asarray(measures[key], dtype=float) | 239 | values = np.asarray(measures[key], dtype=float) |
230 | mean = np.mean(values) | 240 | mean = np.mean(values) |
231 | std = np.std(values) | 241 | std = np.std(values) |
232 | f.write(f"[{key} => mean: {mean}, std: {std}] \n") | 242 | f.write(f"[{key} => mean: {mean}, std: {std}] \n") |
233 | 243 |