Commit 4ed3ebc7d7a764a8ea50a8a9c58d4edb83799c99
1 parent
1f8612ebfd
Exists in
master
Save results to a CSV file now
Showing 1 changed file with 131 additions and 56 deletions Side-by-side Diff
scripts/evaluations/clustering.py
... | ... | @@ -8,66 +8,28 @@ |
8 | 8 | import os |
9 | 9 | import time |
10 | 10 | import pickle |
11 | +import csv | |
12 | + | |
11 | 13 | from sklearn.preprocessing import LabelEncoder |
12 | 14 | from sklearn.metrics.pairwise import pairwise_distances |
13 | -from sklearn.metrics import f1_score | |
14 | 15 | from sklearn.cluster import KMeans |
15 | 16 | from sklearn.manifold import TSNE |
17 | +from sklearn.metrics import f1_score, homogeneity_score, completeness_score, v_measure_score | |
16 | 18 | import matplotlib.pyplot as plt |
17 | 19 | |
18 | 20 | from volia.data_io import read_features,read_lst |
21 | +from volia.measures import entropy_score | |
19 | 22 | |
20 | -if __name__ == "__main__": | |
21 | - # Argparse | |
22 | - parser = argparse.ArgumentParser("Compute clustering on a latent space") | |
23 | - parser.add_argument("features") | |
24 | - parser.add_argument("utt2", | |
25 | - type=str, | |
26 | - help="file with [utt] [value]") | |
27 | - parser.add_argument("--idsfrom", | |
28 | - type=str, | |
29 | - default="utt2", | |
30 | - choices=[ | |
31 | - "features", | |
32 | - "utt2" | |
33 | - ], | |
34 | - help="from features or from utt2?") | |
35 | - parser.add_argument("--prefix", | |
36 | - default="", | |
37 | - type=str, | |
38 | - help="prefix of saved files") | |
39 | - parser.add_argument("--outdir", | |
40 | - default=None, | |
41 | - type=str, | |
42 | - help="Output directory") | |
43 | - | |
44 | - args = parser.parse_args() | |
23 | +''' | |
24 | +TODO: | |
25 | +- Add an option allowing the user to choose the number of | |
26 | +clustering to train in order to compute the average and the standard deviation of the measures. | |
27 | +''' | |
45 | 28 | |
46 | - assert args.outdir | |
47 | 29 | |
48 | - start = time.time() | |
30 | +def train_clustering(label_encoder, feats, classes, outdir): | |
31 | + num_classes = len(label_encoder.classes_) | |
49 | 32 | |
50 | - # Load features and utt2 | |
51 | - features = read_features(args.features) | |
52 | - utt2 = read_lst(args.utt2) | |
53 | - | |
54 | - # Take id list | |
55 | - if args.idsfrom == "features": | |
56 | - ids = list(features.keys()) | |
57 | - elif args.idsfrom == "utt2": | |
58 | - ids = list(utt2.keys()) | |
59 | - else: | |
60 | - print(f"idsfrom is not good: {args.idsfrom}") | |
61 | - exit(1) | |
62 | - | |
63 | - feats = np.vstack([ features[id_] for id_ in ids ]) | |
64 | - classes = [ utt2[id_] for id_ in ids ] | |
65 | - | |
66 | - # Encode labels | |
67 | - le = LabelEncoder() | |
68 | - labels = le.fit_transform(classes) | |
69 | - num_classes = len(le.classes_) | |
70 | - | |
71 | 33 | # Compute KMEANS clustering on data |
72 | 34 | estimator = KMeans( |
73 | 35 | n_clusters=num_classes, |
... | ... | @@ -78,7 +40,7 @@ |
78 | 40 | estimator.fit(feats) |
79 | 41 | print(f"Kmeans: processed {estimator.n_iter_} iterations - intertia={estimator.inertia_}") |
80 | 42 | |
81 | - with open(os.path.join(args.outdir, "kmeans.pkl"), "wb") as f: | |
43 | + with open(os.path.join(outdir, f"_kmeans.pkl"), "wb") as f: | |
82 | 44 | pickle.dump(estimator, f) |
83 | 45 | |
84 | 46 | # contains distance to each cluster for each sample |
85 | 47 | |
86 | 48 | |
... | ... | @@ -102,12 +64,28 @@ |
102 | 64 | # F-measure |
103 | 65 | fscores = f1_score(labels, predicted_labels, average=None) |
104 | 66 | fscores_str = "\n".join(map(lambda i: "{0:25s}: {1:.4f}".format(le.classes_[i], fscores[i]), range(len(fscores)))) |
105 | - print(f"F1-scores for each classes:\n{fscores_str}") | |
106 | - print(f"Global score : {np.mean(fscores)}") | |
107 | - with open(os.path.join(args.outdir, args.prefix + "eval_clustering.log"), "w") as fd: | |
67 | + | |
68 | + # Entropy | |
69 | + _, _, entropy = entropy_score(labels, predicted_labels) | |
70 | + | |
71 | + # Homogeneity | |
72 | + homogeneity = homogeneity_score(labels, predicted_labels) | |
73 | + | |
74 | + # Completeness | |
75 | + completeness = completeness_score(labels, predicted_labels) | |
76 | + | |
77 | + # V-Measure | |
78 | + v_measure = v_measure_score(labels, predicted_labels) | |
79 | + | |
80 | + # Write results | |
81 | + with open(os.path.join(outdir, f"_" + args.prefix + "eval_clustering.log"), "w") as fd: | |
108 | 82 | print(f"F1-scores for each classes:\n{fscores_str}", file=fd) |
83 | + print(f"Entropy: {entropy}", file=fd) | |
109 | 84 | print(f"Global score : {np.mean(fscores)}", file=fd) |
110 | - | |
85 | + print(f"Homogeneity: {homogeneity}", file=fd) | |
86 | + print(f"completeness: {completeness}", file=fd) | |
87 | + print(f"v-measure: {v_measure}", file=fd) | |
88 | + | |
111 | 89 | # Process t-SNE and plot |
112 | 90 | tsne_estimator = TSNE() |
113 | 91 | embeddings = tsne_estimator.fit_transform(feats) |
... | ... | @@ -135,7 +113,7 @@ |
135 | 113 | plt.suptitle("Kmeans Clustering") |
136 | 114 | |
137 | 115 | loc = os.path.join( |
138 | - args.outdir, | |
116 | + outdir, | |
139 | 117 | args.prefix + "kmeans.pdf" |
140 | 118 | ) |
141 | 119 | plt.savefig(loc, bbox_inches="tight") |
... | ... | @@ -145,4 +123,101 @@ |
145 | 123 | |
146 | 124 | end = time.time() |
147 | 125 | print("program ended in {0:.2f} seconds".format(end-start)) |
126 | + return { | |
127 | + "f1": np.mean(fscores), | |
128 | + "entropy": entropy, | |
129 | + "homogeneity": homogeneity, | |
130 | + "completeness": completeness, | |
131 | + "v-measure": v_measure | |
132 | + } | |
133 | + | |
134 | +if __name__ == "__main__": | |
135 | + # Argparse | |
136 | + parser = argparse.ArgumentParser("Compute clustering on a latent space") | |
137 | + parser.add_argument("features") | |
138 | + parser.add_argument("utt2", | |
139 | + type=str, | |
140 | + help="file with [utt] [value]") | |
141 | + parser.add_argument("--idsfrom", | |
142 | + type=str, | |
143 | + default="utt2", | |
144 | + choices=[ | |
145 | + "features", | |
146 | + "utt2" | |
147 | + ], | |
148 | + help="from features or from utt2?") | |
149 | + parser.add_argument("--prefix", | |
150 | + default="", | |
151 | + type=str, | |
152 | + help="prefix of saved files") | |
153 | + parser.add_argument("--outdir", | |
154 | + default=None, | |
155 | + type=str, | |
156 | + help="Output directory") | |
157 | + parser.add_argument("--nmodels", | |
158 | + type=int, | |
159 | + default=1, | |
160 | + help="specifies the number of models to train") | |
161 | + args = parser.parse_args() | |
162 | + | |
163 | + assert args.outdir | |
164 | + | |
165 | + start = time.time() | |
166 | + | |
167 | + # Load features and utt2 | |
168 | + features = read_features(args.features) | |
169 | + utt2 = read_lst(args.utt2) | |
170 | + | |
171 | + # Take id list | |
172 | + if args.idsfrom == "features": | |
173 | + ids = list(features.keys()) | |
174 | + elif args.idsfrom == "utt2": | |
175 | + ids = list(utt2.keys()) | |
176 | + else: | |
177 | + print(f"idsfrom is not good: {args.idsfrom}") | |
178 | + exit(1) | |
179 | + | |
180 | + feats = np.vstack([ features[id_] for id_ in ids ]) | |
181 | + classes = [ utt2[id_] for id_ in ids ] | |
182 | + | |
183 | + # Encode labels | |
184 | + le = LabelEncoder() | |
185 | + labels = le.fit_transform(classes) | |
186 | + | |
187 | + measures = {} | |
188 | + for i in range(1, args.nmodels+1): | |
189 | + subdir = os.path.join(args.outdir, str(i)) | |
190 | + if not os.path.exists(subdir): | |
191 | + os.mkdir(subdir) | |
192 | + print(f"[{i}/{args.nmodels}] => {subdir}") | |
193 | + results = train_clustering(le, feats, classes, subdir) | |
194 | + | |
195 | + for key, value in results.items(): | |
196 | + if key not in measures: | |
197 | + measures[key] = [] | |
198 | + measures[key].append(results[key]) | |
199 | + | |
200 | + | |
201 | + # File with results | |
202 | + file_results = os.path.join(args.outdir, "clustering_measures.txt") | |
203 | + | |
204 | + with open(file_results, "w") as f: | |
205 | + f.write(f"[nmodels: {args.nmodels}]\n") | |
206 | + for key in measures.keys(): | |
207 | + values = np.asarray(measures[key], dtype=float) | |
208 | + mean = np.mean(values) | |
209 | + std = np.std(values) | |
210 | + f.write(f"[{key} => mean: {mean}, std: {std}] \n") | |
211 | + | |
212 | + # CSV File with all the values | |
213 | + file_csv_measures = os.path.join(args.outdir, "clustering_measures.csv") | |
214 | + | |
215 | + with open(file_csv_measures, "w", newline="") as f: | |
216 | + writer = csv.writer(f, delimiter=",") | |
217 | + writer.writerow(["measure"] + list(range(1, args.nmodels+1)) + ["mean"] + ["std"]) | |
218 | + for key in measures.keys(): | |
219 | + values = np.asarray(measures[key], dtype=float) | |
220 | + mean = np.mean(values) | |
221 | + std = np.std(values) | |
222 | + writer.writerow([key] + list(values) + [mean] + [std]) |