Commit 4ed3ebc7d7a764a8ea50a8a9c58d4edb83799c99
1 parent
1f8612ebfd
Exists in
master
Save results to a CSV file now
Showing 1 changed file with 131 additions and 56 deletions Inline Diff
scripts/evaluations/clustering.py
1 | ''' | 1 | ''' |
2 | This script allows the user to evaluate a classification system on new labels using clustering methods. | 2 | This script allows the user to evaluate a classification system on new labels using clustering methods. |
3 | The algorithms are applied on the given latent space (embedding). | 3 | The algorithms are applied on the given latent space (embedding). |
4 | ''' | 4 | ''' |
5 | import argparse | 5 | import argparse |
6 | import numpy as np | 6 | import numpy as np |
7 | import pandas as pd | 7 | import pandas as pd |
8 | import os | 8 | import os |
9 | import time | 9 | import time |
10 | import pickle | 10 | import pickle |
11 | import csv | ||
12 | |||
11 | from sklearn.preprocessing import LabelEncoder | 13 | from sklearn.preprocessing import LabelEncoder |
12 | from sklearn.metrics.pairwise import pairwise_distances | 14 | from sklearn.metrics.pairwise import pairwise_distances |
13 | from sklearn.metrics import f1_score | ||
14 | from sklearn.cluster import KMeans | 15 | from sklearn.cluster import KMeans |
15 | from sklearn.manifold import TSNE | 16 | from sklearn.manifold import TSNE |
17 | from sklearn.metrics import f1_score, homogeneity_score, completeness_score, v_measure_score | ||
16 | import matplotlib.pyplot as plt | 18 | import matplotlib.pyplot as plt |
17 | 19 | ||
18 | from volia.data_io import read_features,read_lst | 20 | from volia.data_io import read_features,read_lst |
21 | from volia.measures import entropy_score | ||
19 | 22 | ||
20 | if __name__ == "__main__": | 23 | ''' |
21 | # Argparse | 24 | TODO: |
22 | parser = argparse.ArgumentParser("Compute clustering on a latent space") | 25 | - Add an option allowing the user to choose the number of |
23 | parser.add_argument("features") | 26 | clustering to train in order to compute the average and the |
24 | parser.add_argument("utt2", | 27 | ''' |
25 | type=str, | ||
26 | help="file with [utt] [value]") | ||
27 | parser.add_argument("--idsfrom", | ||
28 | type=str, | ||
29 | default="utt2", | ||
30 | choices=[ | ||
31 | "features", | ||
32 | "utt2" | ||
33 | ], | ||
34 | help="from features or from utt2?") | ||
35 | parser.add_argument("--prefix", | ||
36 | default="", | ||
37 | type=str, | ||
38 | help="prefix of saved files") | ||
39 | parser.add_argument("--outdir", | ||
40 | default=None, | ||
41 | type=str, | ||
42 | help="Output directory") | ||
43 | |||
44 | args = parser.parse_args() | ||
45 | 28 | ||
46 | assert args.outdir | ||
47 | 29 | ||
def train_clustering(label_encoder, feats, classes, outdir):
    """Train and evaluate one KMeans clustering on a latent space.

    Fits a KMeans model with as many clusters as encoded classes, names each
    cluster after its most represented true label, scores the resulting
    pseudo-classification, writes the model / a score log / a t-SNE plot to
    `outdir`, and returns the scores.

    Parameters
    ----------
    label_encoder : sklearn.preprocessing.LabelEncoder
        Encoder already fitted on the full set of class names.
    feats : np.ndarray
        Feature matrix, one row per utterance.
    classes : list
        Ground-truth class name for each row of `feats`.
    outdir : str
        Directory where the model, log and figure are written.

    Returns
    -------
    dict
        Keys "f1", "entropy", "homogeneity", "completeness", "v-measure".
    """
    num_classes = len(label_encoder.classes_)

    # FIX: the original body referenced undefined globals `le` and `labels`;
    # encode the ground-truth classes here from the provided encoder instead.
    labels = label_encoder.transform(classes)

    # Compute KMEANS clustering on data
    estimator = KMeans(
        n_clusters=num_classes,
        n_init=100,
        tol=1e-6,  # FIX: was `10-6`, which evaluates to the integer 4
        algorithm="elkan"
    )
    estimator.fit(feats)
    print(f"Kmeans: processed {estimator.n_iter_} iterations - intertia={estimator.inertia_}")

    with open(os.path.join(outdir, "_kmeans.pkl"), "wb") as f:
        pickle.dump(estimator, f)

    # contains distance to each cluster for each sample
    dist_space = estimator.transform(feats)
    predictions = np.argmin(dist_space, axis=1)

    # gives each cluster a name (considering most represented character)
    dataframe = pd.DataFrame({
        "label": pd.Series(list(map(lambda x: label_encoder.classes_[x], labels))),
        "prediction": pd.Series(predictions)
    })

    def find_cluster_name_fn(c):
        # Majority vote: most frequent true label among samples in cluster c.
        mask = dataframe["prediction"] == c
        return dataframe[mask]["label"].value_counts(sort=False).idxmax()

    cluster_names = list(map(find_cluster_name_fn, range(num_classes)))
    predicted_labels = label_encoder.transform(
        [cluster_names[pred] for pred in predictions])

    # F-measure (one score per class, then averaged for the global score)
    fscores = f1_score(labels, predicted_labels, average=None)
    fscores_str = "\n".join(map(lambda i: "{0:25s}: {1:.4f}".format(label_encoder.classes_[i], fscores[i]), range(len(fscores))))

    # Entropy (project-local measure; only the third return value is kept)
    _, _, entropy = entropy_score(labels, predicted_labels)

    # Homogeneity
    homogeneity = homogeneity_score(labels, predicted_labels)

    # Completeness
    completeness = completeness_score(labels, predicted_labels)

    # V-Measure
    v_measure = v_measure_score(labels, predicted_labels)

    # Write results
    # NOTE(review): `args` is a module-level global defined in __main__ —
    # this function only works when run through the CLI; consider passing
    # the prefix as a parameter. Confirm before refactoring callers.
    with open(os.path.join(outdir, "_" + args.prefix + "eval_clustering.log"), "w") as fd:
        print(f"F1-scores for each classes:\n{fscores_str}", file=fd)
        print(f"Entropy: {entropy}", file=fd)
        print(f"Global score : {np.mean(fscores)}", file=fd)
        print(f"Homogeneity: {homogeneity}", file=fd)
        print(f"completeness: {completeness}", file=fd)
        print(f"v-measure: {v_measure}", file=fd)

    # Process t-SNE and plot
    tsne_estimator = TSNE()
    embeddings = tsne_estimator.fit_transform(feats)
    print("t-SNE: processed {0} iterations - KL_divergence={1:.4f}".format(
        tsne_estimator.n_iter_, tsne_estimator.kl_divergence_))

    # Left pane: samples colored by true label; right pane: by cluster name.
    fig, [axe1, axe2] = plt.subplots(1, 2, figsize=(10, 5))
    for c, name in enumerate(label_encoder.classes_):
        c_mask = np.where(labels == c)
        axe1.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)

        try:
            id_cluster = cluster_names.index(name)
        except ValueError:
            # Majority naming can leave a class with no cluster at all.
            print("WARNING: no cluster found for {}".format(name))
            continue
        c_mask = np.where(predictions == id_cluster)
        axe2.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)

    axe1.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
    axe1.set_title("true labels")
    axe2.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
    axe2.set_title("predicted cluster label")

    plt.suptitle("Kmeans Clustering")

    loc = os.path.join(
        outdir,
        args.prefix + "kmeans.pdf"
    )
    plt.savefig(loc, bbox_inches="tight")
    plt.close()

    print("INFO: figure saved at {}".format(loc))

    end = time.time()
    # NOTE(review): `start` is also a __main__ global — this reports time
    # since the script started, not time spent inside this function.
    print("program ended in {0:.2f} seconds".format(end-start))
    return {
        "f1": np.mean(fscores),
        "entropy": entropy,
        "homogeneity": homogeneity,
        "completeness": completeness,
        "v-measure": v_measure
    }
133 | |||
134 | if __name__ == "__main__": | ||
135 | # Argparse | ||
136 | parser = argparse.ArgumentParser("Compute clustering on a latent space") | ||
137 | parser.add_argument("features") | ||
138 | parser.add_argument("utt2", | ||
139 | type=str, | ||
140 | help="file with [utt] [value]") | ||
141 | parser.add_argument("--idsfrom", | ||
142 | type=str, | ||
143 | default="utt2", | ||
144 | choices=[ | ||
145 | "features", | ||
146 | "utt2" | ||
147 | ], | ||
148 | help="from features or from utt2?") | ||
149 | parser.add_argument("--prefix", | ||
150 | default="", | ||
151 | type=str, | ||
152 | help="prefix of saved files") | ||
153 | parser.add_argument("--outdir", | ||
154 | default=None, | ||
155 | type=str, | ||
156 | help="Output directory") | ||
157 | parser.add_argument("--nmodels", | ||
158 | type=int, | ||
159 | default=1, | ||
160 | help="specifies the number of models to train") | ||
161 | args = parser.parse_args() | ||
162 | |||
163 | assert args.outdir | ||
164 | |||
165 | start = time.time() | ||
166 | |||
167 | # Load features and utt2 | ||
168 | features = read_features(args.features) | ||
169 | utt2 = read_lst(args.utt2) | ||
170 | |||
171 | # Take id list | ||
172 | if args.idsfrom == "features": | ||
173 | ids = list(features.keys()) | ||
174 | elif args.idsfrom == "utt2": | ||
175 | ids = list(utt2.keys()) | ||
176 | else: | ||
177 | print(f"idsfrom is not good: {args.idsfrom}") | ||
178 | exit(1) | ||
179 |