Commit 15b183a24d4fa0c2e97c2239e8a132470a6749b1
Parent: 503bfd9274
Exists in: master
Add purity measure to the script
Showing 1 changed file with 15 additions and 2 deletions
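The diff below only consumes volia.measures.purity_score, which evidently returns a dict carrying a class-side purity, a cluster-side purity, and an overall criterion K. scikit-learn ships no purity metric, so for context here is a minimal sketch of a purity measure exposing the same output keys. The definitions used (each cluster scored by its dominant class, each class by its dominant cluster, K as the geometric mean of the two averages) are assumptions for illustration; volia's actual implementation may differ.

# Hypothetical sketch of a purity measure with the same keys as
# volia.measures.purity_score; the definitions below are assumptions.
import numpy as np

def purity_score_sketch(labels, predictions):
    labels = np.asarray(labels)
    predictions = np.asarray(predictions)
    classes = np.unique(labels)
    clusters = np.unique(predictions)
    # Contingency table: counts[i, j] = samples of class i falling in cluster j.
    counts = np.array([[np.sum((labels == c) & (predictions == k))
                        for k in clusters] for c in classes])
    # Cluster-side purity: share of each cluster covered by its dominant class,
    # averaged over clusters.
    purity_cluster_score = np.mean(counts.max(axis=0) / counts.sum(axis=0))
    # Class-side purity: share of each class gathered in its dominant cluster,
    # averaged over classes.
    purity_class_score = np.mean(counts.max(axis=1) / counts.sum(axis=1))
    # Overall criterion: geometric mean of the two purities.
    K = np.sqrt(purity_class_score * purity_cluster_score)
    return {
        "purity_class_score": purity_class_score,
        "purity_cluster_score": purity_cluster_score,
        "K": K,
    }

Reporting both sides guards against degenerate solutions: singleton clusters make the cluster-side purity perfect, a single catch-all cluster makes the class-side purity perfect, and K penalizes both.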
scripts/evaluations/clustering.py
 '''
 This script allows the user to evaluate a classification system on new labels using clustering methods.
 The algorithms are applied on the given latent space (embedding).
 '''
 import argparse
 import numpy as np
 import pandas as pd
 import os
 import time
 import pickle
 import csv

 from sklearn.preprocessing import LabelEncoder
 from sklearn.metrics.pairwise import pairwise_distances
 from sklearn.cluster import KMeans
 from sklearn.manifold import TSNE
 from sklearn.metrics import f1_score, homogeneity_score, completeness_score, v_measure_score
 import matplotlib.pyplot as plt

 from volia.data_io import read_features, read_lst
-from volia.measures import entropy_score
+from volia.measures import entropy_score, purity_score

 '''
 TODO:
 - Add an option allowing the user to choose the number of
 clustering to train in order to compute the average and the
 '''


 def train_clustering(label_encoder, feats, classes, outdir):
     num_classes = len(label_encoder.classes_)

     # Compute KMEANS clustering on data
     estimator = KMeans(
         n_clusters=num_classes,
         n_init=100,
         tol=1e-6,
         algorithm="elkan"
     )
     estimator.fit(feats)
41 | print(f"Kmeans: processed {estimator.n_iter_} iterations - intertia={estimator.inertia_}") | 41 | print(f"Kmeans: processed {estimator.n_iter_} iterations - intertia={estimator.inertia_}") |

     with open(os.path.join(outdir, f"_kmeans.pkl"), "wb") as f:
         pickle.dump(estimator, f)

     # contains distance to each cluster for each sample
     dist_space = estimator.transform(feats)
     predictions = np.argmin(dist_space, axis=1)

     # gives each cluster a name (considering most represented character)
     dataframe = pd.DataFrame({
         "label": pd.Series(list(map(lambda x: le.classes_[x], labels))),
         "prediction": pd.Series(predictions)
     })

     def find_cluster_name_fn(c):
         mask = dataframe["prediction"] == c
         return dataframe[mask]["label"].value_counts(sort=False).idxmax()

     cluster_names = list(map(find_cluster_name_fn, range(num_classes)))
     predicted_labels = le.transform(
         [cluster_names[pred] for pred in predictions])

     # F-measure
     fscores = f1_score(labels, predicted_labels, average=None)
     fscores_str = "\n".join(map(lambda i: "{0:25s}: {1:.4f}".format(le.classes_[i], fscores[i]), range(len(fscores))))

     # Entropy
     _, _, entropy = entropy_score(labels, predicted_labels)

     # Homogeneity
     homogeneity = homogeneity_score(labels, predicted_labels)

     # Completeness
     completeness = completeness_score(labels, predicted_labels)

     # V-Measure
     v_measure = v_measure_score(labels, predicted_labels)

+    # Purity
+    purity_scores = purity_score(labels, predicted_labels)
+    purity_class_score = purity_scores["purity_class_score"]
+    purity_cluster_score = purity_scores["purity_cluster_score"]
+    K = purity_scores["K"]
+
     # Write results
     with open(os.path.join(outdir, f"_" + args.prefix + "eval_clustering.log"), "w") as fd:
82 | print(f"F1-scores for each classes:\n{fscores_str}", file=fd) | 88 | print(f"F1-scores for each classes:\n{fscores_str}", file=fd) |
83 | print(f"Entropy: {entropy}", file=fd) | 89 | print(f"Entropy: {entropy}", file=fd) |
84 | print(f"Global score : {np.mean(fscores)}", file=fd) | 90 | print(f"Global score : {np.mean(fscores)}", file=fd) |
85 | print(f"Homogeneity: {homogeneity}", file=fd) | 91 | print(f"Homogeneity: {homogeneity}", file=fd) |
86 | print(f"completeness: {completeness}", file=fd) | 92 | print(f"completeness: {completeness}", file=fd) |
87 | print(f"v-measure: {v_measure}", file=fd) | 93 | print(f"v-measure: {v_measure}", file=fd) |
94 | print(f"purity class score: {purity_class_score}", file=fd) | ||
95 | print(f"purity cluster score: {purity_cluster_score}", file=fd) | ||
96 | print(f"purity overall evaluation criterion (K): {K}", file=fd) | ||
88 | 97 | ||
89 | # Process t-SNE and plot | 98 | # Process t-SNE and plot |
90 | tsne_estimator = TSNE() | 99 | tsne_estimator = TSNE() |
91 | embeddings = tsne_estimator.fit_transform(feats) | 100 | embeddings = tsne_estimator.fit_transform(feats) |
92 | print("t-SNE: processed {0} iterations - KL_divergence={1:.4f}".format( | 101 | print("t-SNE: processed {0} iterations - KL_divergence={1:.4f}".format( |
93 | tsne_estimator.n_iter_, tsne_estimator.kl_divergence_)) | 102 | tsne_estimator.n_iter_, tsne_estimator.kl_divergence_)) |
94 | 103 | ||
95 | fig, [axe1, axe2] = plt.subplots(1, 2, figsize=(10, 5)) | 104 | fig, [axe1, axe2] = plt.subplots(1, 2, figsize=(10, 5)) |
96 | for c, name in enumerate(le.classes_): | 105 | for c, name in enumerate(le.classes_): |
97 | c_mask = np.where(labels == c) | 106 | c_mask = np.where(labels == c) |
98 | axe1.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None) | 107 | axe1.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None) |
99 | 108 | ||
100 | try: | 109 | try: |
101 | id_cluster = cluster_names.index(name) | 110 | id_cluster = cluster_names.index(name) |
102 | except ValueError: | 111 | except ValueError: |
103 | print("WARNING: no cluster found for {}".format(name)) | 112 | print("WARNING: no cluster found for {}".format(name)) |
104 | continue | 113 | continue |
105 | c_mask = np.where(predictions == id_cluster) | 114 | c_mask = np.where(predictions == id_cluster) |
106 | axe2.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None) | 115 | axe2.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None) |
107 | 116 | ||
108 | axe1.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35)) | 117 | axe1.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35)) |
109 | axe1.set_title("true labels") | 118 | axe1.set_title("true labels") |
110 | axe2.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35)) | 119 | axe2.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35)) |
111 | axe2.set_title("predicted cluster label") | 120 | axe2.set_title("predicted cluster label") |
112 | 121 | ||
113 | plt.suptitle("Kmeans Clustering") | 122 | plt.suptitle("Kmeans Clustering") |
114 | 123 | ||
115 | loc = os.path.join( | 124 | loc = os.path.join( |
116 | outdir, | 125 | outdir, |
117 | args.prefix + "kmeans.pdf" | 126 | args.prefix + "kmeans.pdf" |
118 | ) | 127 | ) |
119 | plt.savefig(loc, bbox_inches="tight") | 128 | plt.savefig(loc, bbox_inches="tight") |
120 | plt.close() | 129 | plt.close() |
121 | 130 | ||
122 | print("INFO: figure saved at {}".format(loc)) | 131 | print("INFO: figure saved at {}".format(loc)) |
123 | 132 | ||
124 | end = time.time() | 133 | end = time.time() |
125 | print("program ended in {0:.2f} seconds".format(end-start)) | 134 | print("program ended in {0:.2f} seconds".format(end-start)) |
126 | return { | 135 | return { |
127 | "f1": np.mean(fscores), | 136 | "f1": np.mean(fscores), |
128 | "entropy": entropy, | 137 | "entropy": entropy, |
129 | "homogeneity": homogeneity, | 138 | "homogeneity": homogeneity, |
130 | "completeness": completeness, | 139 | "completeness": completeness, |
131 | "v-measure": v_measure | 140 | "v-measure": v_measure, |
141 | "purity_class_score": purity_class_score, | ||
142 | "purity_cluster score": purity_cluster_score, | ||
143 | "K": K | ||
132 | } | 144 | } |
145 | |||
133 | 146 | ||
134 | if __name__ == "__main__": | 147 | if __name__ == "__main__": |
135 | # Argparse | 148 | # Argparse |
136 | parser = argparse.ArgumentParser("Compute clustering on a latent space") | 149 | parser = argparse.ArgumentParser("Compute clustering on a latent space") |
137 | parser.add_argument("features") | 150 | parser.add_argument("features") |
138 | parser.add_argument("utt2", | 151 | parser.add_argument("utt2", |
139 | type=str, | 152 | type=str, |
140 | help="file with [utt] [value]") | 153 | help="file with [utt] [value]") |
141 | parser.add_argument("--idsfrom", | 154 | parser.add_argument("--idsfrom", |
142 | type=str, | 155 | type=str, |
143 | default="utt2", | 156 | default="utt2", |
144 | choices=[ | 157 | choices=[ |
145 | "features", | 158 | "features", |
146 | "utt2" | 159 | "utt2" |
147 | ], | 160 | ], |
148 | help="from features or from utt2?") | 161 | help="from features or from utt2?") |
149 | parser.add_argument("--prefix", | 162 | parser.add_argument("--prefix", |
150 | default="", | 163 | default="", |
151 | type=str, | 164 | type=str, |
152 | help="prefix of saved files") | 165 | help="prefix of saved files") |
153 | parser.add_argument("--outdir", | 166 | parser.add_argument("--outdir", |
154 | default=None, | 167 | default=None, |
155 | type=str, | 168 | type=str, |
156 | help="Output directory") | 169 | help="Output directory") |
157 | parser.add_argument("--nmodels", | 170 | parser.add_argument("--nmodels", |
158 | type=int, | 171 | type=int, |
159 | default=1, | 172 | default=1, |
160 | help="specifies the number of models to train") | 173 | help="specifies the number of models to train") |
161 | args = parser.parse_args() | 174 | args = parser.parse_args() |
162 | 175 | ||
163 | assert args.outdir | 176 | assert args.outdir |
164 | 177 | ||
165 | start = time.time() | 178 | start = time.time() |
166 | 179 | ||
167 | # Load features and utt2 | 180 | # Load features and utt2 |
168 | features = read_features(args.features) | 181 | features = read_features(args.features) |
169 | utt2 = read_lst(args.utt2) | 182 | utt2 = read_lst(args.utt2) |
170 | 183 | ||
171 | # Take id list | 184 | # Take id list |
172 | if args.idsfrom == "features": | 185 | if args.idsfrom == "features": |
173 | ids = list(features.keys()) | 186 | ids = list(features.keys()) |
174 | elif args.idsfrom == "utt2": | 187 | elif args.idsfrom == "utt2": |
175 | ids = list(utt2.keys()) | 188 | ids = list(utt2.keys()) |
176 | else: | 189 | else: |
177 | print(f"idsfrom is not good: {args.idsfrom}") | 190 | print(f"idsfrom is not good: {args.idsfrom}") |
         exit(1)

     feats = np.vstack([ features[id_] for id_ in ids ])
     classes = [ utt2[id_] for id_ in ids ]

     # Encode labels
     le = LabelEncoder()
     labels = le.fit_transform(classes)

     measures = {}
     for i in range(1, args.nmodels+1):
         subdir = os.path.join(args.outdir, str(i))
         if not os.path.exists(subdir):
             os.mkdir(subdir)
         print(f"[{i}/{args.nmodels}] => {subdir}")
         results = train_clustering(le, feats, classes, subdir)

         for key, value in results.items():
             if key not in measures:
                 measures[key] = []
             measures[key].append(results[key])


     # File with results
     file_results = os.path.join(args.outdir, "clustering_measures.txt")

     with open(file_results, "w") as f:
         f.write(f"[nmodels: {args.nmodels}]\n")
         for key in measures.keys():
             values = np.asarray(measures[key], dtype=float)
             mean = np.mean(values)
             std = np.std(values)
             f.write(f"[{key} => mean: {mean}, std: {std}] \n")

     # CSV File with all the values
     file_csv_measures = os.path.join(args.outdir, "clustering_measures.csv")

     with open(file_csv_measures, "w", newline="") as f:
         writer = csv.writer(f, delimiter=",")
         writer.writerow(["measure"] + list(range(1, args.nmodels+1)) + ["mean"] + ["std"])
         for key in measures.keys():
             values = np.asarray(measures[key], dtype=float)
             mean = np.mean(values)
             std = np.std(values)
             writer.writerow([key] + list(values) + [mean] + [std])
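With --outdir required and --nmodels controlling how many clusterings are trained, a run looks like python scripts/evaluations/clustering.py feats utt2label --outdir out --nmodels 10 (the input file names here are placeholders). Each model's log lands in out/<i>/, and the purity values now appear alongside the other measures in clustering_measures.txt and clustering_measures.csv. A quick way to sanity-check the new measure in isolation, assuming volia is importable and using made-up labels:

# Hypothetical smoke test for the purity measure, outside the script.
import numpy as np
from volia.measures import purity_score

labels      = np.array([0, 0, 0, 1, 1, 2])  # ground-truth classes (made up)
predictions = np.array([0, 0, 1, 1, 1, 2])  # cluster assignments (made up)

scores = purity_score(labels, predictions)
print(scores["purity_class_score"], scores["purity_cluster_score"], scores["K"])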