Commit 4ed3ebc7d7a764a8ea50a8a9c58d4edb83799c99
1 parent 1f8612ebfd
Exists in master
Save results to a CSV file now
Showing 1 changed file with 131 additions and 56 deletions
scripts/evaluations/clustering.py
| ... | ... | @@ -8,66 +8,28 @@ |
| 8 | 8 | import os |
| 9 | 9 | import time |
| 10 | 10 | import pickle |
| 11 | +import csv | |
| 12 | + | |
| 11 | 13 | from sklearn.preprocessing import LabelEncoder |
| 12 | 14 | from sklearn.metrics.pairwise import pairwise_distances |
| 13 | -from sklearn.metrics import f1_score | |
| 14 | 15 | from sklearn.cluster import KMeans |
| 15 | 16 | from sklearn.manifold import TSNE |
| 17 | +from sklearn.metrics import f1_score, homogeneity_score, completeness_score, v_measure_score | |
| 16 | 18 | import matplotlib.pyplot as plt |
| 17 | 19 | |
| 18 | 20 | from volia.data_io import read_features,read_lst |
| 21 | +from volia.measures import entropy_score | |
| 19 | 22 | |
| 20 | -if __name__ == "__main__": | |
| 21 | - # Argparse | |
| 22 | - parser = argparse.ArgumentParser("Compute clustering on a latent space") | |
| 23 | - parser.add_argument("features") | |
| 24 | - parser.add_argument("utt2", | |
| 25 | - type=str, | |
| 26 | - help="file with [utt] [value]") | |
| 27 | - parser.add_argument("--idsfrom", | |
| 28 | - type=str, | |
| 29 | - default="utt2", | |
| 30 | - choices=[ | |
| 31 | - "features", | |
| 32 | - "utt2" | |
| 33 | - ], | |
| 34 | - help="from features or from utt2?") | |
| 35 | - parser.add_argument("--prefix", | |
| 36 | - default="", | |
| 37 | - type=str, | |
| 38 | - help="prefix of saved files") | |
| 39 | - parser.add_argument("--outdir", | |
| 40 | - default=None, | |
| 41 | - type=str, | |
| 42 | - help="Output directory") | |
| 43 | - | |
| 44 | - args = parser.parse_args() | |
| 23 | +''' | |
| 24 | +TODO: | |
| 25 | +- Add an option letting the user choose the number of clusterings | |
| 26 | +to train, to compute the average and the standard deviation of the measures. | |
| 27 | +''' | |
| 45 | 28 | |
| 46 | - assert args.outdir | |
| 47 | 29 | |
| 48 | - start = time.time() | |
| 30 | +def train_clustering(label_encoder, feats, labels, outdir): | |
| 31 | + num_classes = len(label_encoder.classes_) | |
| 49 | 32 | |
| 50 | - # Load features and utt2 | |
| 51 | - features = read_features(args.features) | |
| 52 | - utt2 = read_lst(args.utt2) | |
| 53 | - | |
| 54 | - # Take id list | |
| 55 | - if args.idsfrom == "features": | |
| 56 | - ids = list(features.keys()) | |
| 57 | - elif args.idsfrom == "utt2": | |
| 58 | - ids = list(utt2.keys()) | |
| 59 | - else: | |
| 60 | - print(f"idsfrom is not good: {args.idsfrom}") | |
| 61 | - exit(1) | |
| 62 | - | |
| 63 | - feats = np.vstack([ features[id_] for id_ in ids ]) | |
| 64 | - classes = [ utt2[id_] for id_ in ids ] | |
| 65 | - | |
| 66 | - # Encode labels | |
| 67 | - le = LabelEncoder() | |
| 68 | - labels = le.fit_transform(classes) | |
| 69 | - num_classes = len(le.classes_) | |
| 70 | - | |
| 71 | 33 | # Compute KMEANS clustering on data |
| 72 | 34 | estimator = KMeans( |
| 73 | 35 | n_clusters=num_classes, |
| ... | ... | @@ -78,7 +40,7 @@ |
| 78 | 40 | estimator.fit(feats) |
| 79 | 41 | print(f"Kmeans: processed {estimator.n_iter_} iterations - inertia={estimator.inertia_}") |
| 80 | 42 | |
| 81 | - with open(os.path.join(args.outdir, "kmeans.pkl"), "wb") as f: | |
| 43 | + with open(os.path.join(outdir, "_kmeans.pkl"), "wb") as f: | |
| 82 | 44 | pickle.dump(estimator, f) |
| 83 | 45 | |
| 84 | 46 | # contains distance to each cluster for each sample |
| 85 | 47 | |
| 86 | 48 | |
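
Each trained model is pickled into its run directory, so a clustering can be reloaded later without refitting. A minimal sketch of reloading one run's estimator (the path here is hypothetical; with `--nmodels`, each run writes into a numbered subdirectory of `--outdir`):

```python
import os
import pickle

# Hypothetical run directory: subdirectory "1" of the --outdir argument.
outdir = "exp/clustering/1"

# Reload the KMeans estimator written by train_clustering.
with open(os.path.join(outdir, "_kmeans.pkl"), "rb") as f:
    estimator = pickle.load(f)

print(estimator.n_clusters, estimator.inertia_)
# estimator.predict(new_feats) would assign new samples to the learned clusters.
```
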
| ... | ... | @@ -102,12 +64,28 @@ |
| 102 | 64 | # F-measure |
| 103 | 65 | fscores = f1_score(labels, predicted_labels, average=None) |
| 104 | 66 | fscores_str = "\n".join(map(lambda i: "{0:25s}: {1:.4f}".format(label_encoder.classes_[i], fscores[i]), range(len(fscores)))) |
| 105 | - print(f"F1-scores for each classes:\n{fscores_str}") | |
| 106 | - print(f"Global score : {np.mean(fscores)}") | |
| 107 | - with open(os.path.join(args.outdir, args.prefix + "eval_clustering.log"), "w") as fd: | |
| 67 | + | |
| 68 | + # Entropy | |
| 69 | + _, _, entropy = entropy_score(labels, predicted_labels) | |
| 70 | + | |
| 71 | + # Homogeneity | |
| 72 | + homogeneity = homogeneity_score(labels, predicted_labels) | |
| 73 | + | |
| 74 | + # Completeness | |
| 75 | + completeness = completeness_score(labels, predicted_labels) | |
| 76 | + | |
| 77 | + # V-Measure | |
| 78 | + v_measure = v_measure_score(labels, predicted_labels) | |
| 79 | + | |
| 80 | + # Write results | |
| 81 | + with open(os.path.join(outdir, "_" + args.prefix + "eval_clustering.log"), "w") as fd: | |
| 108 | 82 | print(f"F1-scores for each classes:\n{fscores_str}", file=fd) |
| 83 | + print(f"Entropy: {entropy}", file=fd) | |
| 109 | 84 | print(f"Global score: {np.mean(fscores)}", file=fd) |
| 110 | - | |
| 85 | + print(f"Homogeneity: {homogeneity}", file=fd) | |
| 86 | + print(f"Completeness: {completeness}", file=fd) | |
| 87 | + print(f"V-measure: {v_measure}", file=fd) | |
| 88 | + | |
| 111 | 89 | # Process t-SNE and plot |
| 112 | 90 | tsne_estimator = TSNE() |
| 113 | 91 | embeddings = tsne_estimator.fit_transform(feats) |
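
The three new sklearn scores are tightly related: homogeneity checks that each cluster holds a single class, completeness checks that each class lands in a single cluster, and the V-measure (with its default beta of 1) is their harmonic mean. A quick sanity check on toy labels:

```python
from sklearn.metrics import completeness_score, homogeneity_score, v_measure_score

labels_true = [0, 0, 1, 1]
labels_pred = [0, 0, 1, 2]  # clusters are pure, but class 1 is split in two

h = homogeneity_score(labels_true, labels_pred)   # 1.0: no mixed cluster
c = completeness_score(labels_true, labels_pred)  # < 1.0: class 1 is split
v = v_measure_score(labels_true, labels_pred)

# V-measure is the harmonic mean of homogeneity and completeness (beta = 1).
assert abs(v - 2 * h * c / (h + c)) < 1e-9
print(h, c, v)
```
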
| ... | ... | @@ -135,7 +113,7 @@ |
| 135 | 113 | plt.suptitle("Kmeans Clustering") |
| 136 | 114 | |
| 137 | 115 | loc = os.path.join( |
| 138 | - args.outdir, | |
| 116 | + outdir, | |
| 139 | 117 | args.prefix + "kmeans.pdf" |
| 140 | 118 | ) |
| 141 | 119 | plt.savefig(loc, bbox_inches="tight") |
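
The plotting step pairs a 2-D t-SNE projection of the features with the KMeans assignments. A self-contained sketch of that pattern on random data (all names and values here are illustrative, not the script's exact plotting code):

```python
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE

feats = np.random.rand(200, 32)              # stand-in feature matrix
estimator = KMeans(n_clusters=4).fit(feats)

# Project to 2-D for visualization only; t-SNE preserves local neighborhoods,
# so distances between clusters in the plot are qualitative.
embeddings = TSNE(n_components=2).fit_transform(feats)

plt.scatter(embeddings[:, 0], embeddings[:, 1], c=estimator.labels_, s=8)
plt.suptitle("Kmeans Clustering")
plt.savefig("kmeans.pdf", bbox_inches="tight")
```
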
| ... | ... | @@ -145,4 +123,101 @@ |
| 145 | 123 | |
| 146 | 124 | end = time.time() |
| 147 | 125 | print("program ended in {0:.2f} seconds".format(end-start)) |
| 126 | + return { | |
| 127 | + "f1": np.mean(fscores), | |
| 128 | + "entropy": entropy, | |
| 129 | + "homogeneity": homogeneity, | |
| 130 | + "completeness": completeness, | |
| 131 | + "v-measure": v_measure | |
| 132 | + } | |
| 133 | + | |
| 134 | +if __name__ == "__main__": | |
| 135 | + # Argparse | |
| 136 | + parser = argparse.ArgumentParser("Compute clustering on a latent space") | |
| 137 | + parser.add_argument("features") | |
| 138 | + parser.add_argument("utt2", | |
| 139 | + type=str, | |
| 140 | + help="file with [utt] [value]") | |
| 141 | + parser.add_argument("--idsfrom", | |
| 142 | + type=str, | |
| 143 | + default="utt2", | |
| 144 | + choices=[ | |
| 145 | + "features", | |
| 146 | + "utt2" | |
| 147 | + ], | |
| 148 | + help="from features or from utt2?") | |
| 149 | + parser.add_argument("--prefix", | |
| 150 | + default="", | |
| 151 | + type=str, | |
| 152 | + help="prefix of saved files") | |
| 153 | + parser.add_argument("--outdir", | |
| 154 | + default=None, | |
| 155 | + type=str, | |
| 156 | + help="Output directory") | |
| 157 | + parser.add_argument("--nmodels", | |
| 158 | + type=int, | |
| 159 | + default=1, | |
| 160 | + help="specifies the number of models to train") | |
| 161 | + args = parser.parse_args() | |
| 162 | + | |
| 163 | + assert args.outdir | |
| 164 | + | |
| 165 | + start = time.time() | |
| 166 | + | |
| 167 | + # Load features and utt2 | |
| 168 | + features = read_features(args.features) | |
| 169 | + utt2 = read_lst(args.utt2) | |
| 170 | + | |
| 171 | + # Take id list | |
| 172 | + if args.idsfrom == "features": | |
| 173 | + ids = list(features.keys()) | |
| 174 | + elif args.idsfrom == "utt2": | |
| 175 | + ids = list(utt2.keys()) | |
| 176 | + else: | |
| 177 | + print(f"Unexpected value for --idsfrom: {args.idsfrom}") | |
| 178 | + exit(1) | |
| 179 | + | |
| 180 | + feats = np.vstack([ features[id_] for id_ in ids ]) | |
| 181 | + classes = [ utt2[id_] for id_ in ids ] | |
| 182 | + | |
| 183 | + # Encode labels | |
| 184 | + le = LabelEncoder() | |
| 185 | + labels = le.fit_transform(classes) | |
| 186 | + | |
| 187 | + measures = {} | |
| 188 | + for i in range(1, args.nmodels+1): | |
| 189 | + subdir = os.path.join(args.outdir, str(i)) | |
| 190 | + if not os.path.exists(subdir): | |
| 191 | + os.mkdir(subdir) | |
| 192 | + print(f"[{i}/{args.nmodels}] => {subdir}") | |
| 193 | + results = train_clustering(le, feats, labels, subdir) | |
| 194 | + | |
| 195 | + for key, value in results.items(): | |
| 196 | + if key not in measures: | |
| 197 | + measures[key] = [] | |
| 198 | + measures[key].append(value) | |
| 199 | + | |
| 200 | + | |
| 201 | + # File with results | |
| 202 | + file_results = os.path.join(args.outdir, "clustering_measures.txt") | |
| 203 | + | |
| 204 | + with open(file_results, "w") as f: | |
| 205 | + f.write(f"[nmodels: {args.nmodels}]\n") | |
| 206 | + for key in measures.keys(): | |
| 207 | + values = np.asarray(measures[key], dtype=float) | |
| 208 | + mean = np.mean(values) | |
| 209 | + std = np.std(values) | |
| 210 | + f.write(f"[{key} => mean: {mean}, std: {std}]\n") | |
| 211 | + | |
| 212 | + # CSV File with all the values | |
| 213 | + file_csv_measures = os.path.join(args.outdir, "clustering_measures.csv") | |
| 214 | + | |
| 215 | + with open(file_csv_measures, "w", newline="") as f: | |
| 216 | + writer = csv.writer(f, delimiter=",") | |
| 217 | + writer.writerow(["measure"] + list(range(1, args.nmodels + 1)) + ["mean", "std"]) | |
| 218 | + for key in measures.keys(): | |
| 219 | + values = np.asarray(measures[key], dtype=float) | |
| 220 | + mean = np.mean(values) | |
| 221 | + std = np.std(values) | |
| 222 | + writer.writerow([key] + list(values) + [mean, std]) |
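
The resulting `clustering_measures.csv` has one row per measure: the per-run values in columns `1..nmodels`, followed by their mean and standard deviation. A sketch of reading it back, assuming exactly that header layout:

```python
import csv

# Parse clustering_measures.csv into per-run values and summary statistics.
with open("clustering_measures.csv", newline="") as f:
    rows = list(csv.reader(f))

header, body = rows[0], rows[1:]
per_run = {row[0]: [float(v) for v in row[1:-2]] for row in body}
summary = {row[0]: {"mean": float(row[-2]), "std": float(row[-1])} for row in body}

print(per_run["f1"], summary["v-measure"])
```
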