From 4ed3ebc7d7a764a8ea50a8a9c58d4edb83799c99 Mon Sep 17 00:00:00 2001 From: Mathias Date: Wed, 16 Sep 2020 17:12:16 +0200 Subject: [PATCH] Save results on a csv file now --- scripts/evaluations/clustering.py | 187 ++++++++++++++++++++++++++------------ 1 file changed, 131 insertions(+), 56 deletions(-) diff --git a/scripts/evaluations/clustering.py b/scripts/evaluations/clustering.py index 72293e2..8d509b3 100644 --- a/scripts/evaluations/clustering.py +++ b/scripts/evaluations/clustering.py @@ -8,65 +8,27 @@ import pandas as pd import os import time import pickle +import csv + from sklearn.preprocessing import LabelEncoder from sklearn.metrics.pairwise import pairwise_distances -from sklearn.metrics import f1_score from sklearn.cluster import KMeans from sklearn.manifold import TSNE +from sklearn.metrics import f1_score, homogeneity_score, completeness_score, v_measure_score import matplotlib.pyplot as plt from volia.data_io import read_features,read_lst +from volia.measures import entropy_score -if __name__ == "__main__": - # Argparse - parser = argparse.ArgumentParser("Compute clustering on a latent space") - parser.add_argument("features") - parser.add_argument("utt2", - type=str, - help="file with [utt] [value]") - parser.add_argument("--idsfrom", - type=str, - default="utt2", - choices=[ - "features", - "utt2" - ], - help="from features or from utt2?") - parser.add_argument("--prefix", - default="", - type=str, - help="prefix of saved files") - parser.add_argument("--outdir", - default=None, - type=str, - help="Output directory") - - args = parser.parse_args() - - assert args.outdir +''' +TODO: +- Add an option allowing the user to choose the number of +clustering models to train in order to compute the average and the standard deviation of each measure +''' - start = time.time() - # Load features and utt2 - features = read_features(args.features) - utt2 = read_lst(args.utt2) - - # Take id list - if args.idsfrom == "features": - ids = list(features.keys()) - elif args.idsfrom == "utt2": - ids = 
list(utt2.keys()) - else: - print(f"idsfrom is not good: {args.idsfrom}") - exit(1) - - feats = np.vstack([ features[id_] for id_ in ids ]) - classes = [ utt2[id_] for id_ in ids ] - - # Encode labels - le = LabelEncoder() - labels = le.fit_transform(classes) - num_classes = len(le.classes_) +def train_clustering(label_encoder, feats, classes, outdir): + num_classes = len(label_encoder.classes_) # Compute KMEANS clustering on data estimator = KMeans( @@ -78,7 +40,7 @@ if __name__ == "__main__": estimator.fit(feats) print(f"Kmeans: processed {estimator.n_iter_} iterations - intertia={estimator.inertia_}") - with open(os.path.join(args.outdir, "kmeans.pkl"), "wb") as f: + with open(os.path.join(outdir, f"_kmeans.pkl"), "wb") as f: pickle.dump(estimator, f) # contains distance to each cluster for each sample @@ -102,12 +64,28 @@ if __name__ == "__main__": # F-measure fscores = f1_score(labels, predicted_labels, average=None) fscores_str = "\n".join(map(lambda i: "{0:25s}: {1:.4f}".format(le.classes_[i], fscores[i]), range(len(fscores)))) - print(f"F1-scores for each classes:\n{fscores_str}") - print(f"Global score : {np.mean(fscores)}") - with open(os.path.join(args.outdir, args.prefix + "eval_clustering.log"), "w") as fd: + + # Entropy + _, _, entropy = entropy_score(labels, predicted_labels) + + # Homogeneity + homogeneity = homogeneity_score(labels, predicted_labels) + + # Completeness + completeness = completeness_score(labels, predicted_labels) + + # V-Measure + v_measure = v_measure_score(labels, predicted_labels) + + # Write results + with open(os.path.join(outdir, f"_" + args.prefix + "eval_clustering.log"), "w") as fd: print(f"F1-scores for each classes:\n{fscores_str}", file=fd) + print(f"Entropy: {entropy}", file=fd) print(f"Global score : {np.mean(fscores)}", file=fd) - + print(f"Homogeneity: {homogeneity}", file=fd) + print(f"completeness: {completeness}", file=fd) + print(f"v-measure: {v_measure}", file=fd) + # Process t-SNE and plot tsne_estimator = 
TSNE() embeddings = tsne_estimator.fit_transform(feats) @@ -135,7 +113,7 @@ if __name__ == "__main__": plt.suptitle("Kmeans Clustering") loc = os.path.join( - args.outdir, + outdir, args.prefix + "kmeans.pdf" ) plt.savefig(loc, bbox_inches="tight") @@ -144,4 +122,101 @@ if __name__ == "__main__": print("INFO: figure saved at {}".format(loc)) end = time.time() - print("program ended in {0:.2f} seconds".format(end-start)) \ No newline at end of file + print("program ended in {0:.2f} seconds".format(end-start)) + return { + "f1": np.mean(fscores), + "entropy": entropy, + "homogeneity": homogeneity, + "completeness": completeness, + "v-measure": v_measure + } + +if __name__ == "__main__": + # Argparse + parser = argparse.ArgumentParser("Compute clustering on a latent space") + parser.add_argument("features") + parser.add_argument("utt2", + type=str, + help="file with [utt] [value]") + parser.add_argument("--idsfrom", + type=str, + default="utt2", + choices=[ + "features", + "utt2" + ], + help="from features or from utt2?") + parser.add_argument("--prefix", + default="", + type=str, + help="prefix of saved files") + parser.add_argument("--outdir", + default=None, + type=str, + help="Output directory") + parser.add_argument("--nmodels", + type=int, + default=1, + help="specifies the number of models to train") + args = parser.parse_args() + + assert args.outdir + + start = time.time() + + # Load features and utt2 + features = read_features(args.features) + utt2 = read_lst(args.utt2) + + # Take id list + if args.idsfrom == "features": + ids = list(features.keys()) + elif args.idsfrom == "utt2": + ids = list(utt2.keys()) + else: + print(f"idsfrom is not good: {args.idsfrom}") + exit(1) + + feats = np.vstack([ features[id_] for id_ in ids ]) + classes = [ utt2[id_] for id_ in ids ] + + # Encode labels + le = LabelEncoder() + labels = le.fit_transform(classes) + + measures = {} + for i in range(1, args.nmodels+1): + subdir = os.path.join(args.outdir, str(i)) + if not 
os.path.exists(subdir): + os.mkdir(subdir) + print(f"[{i}/{args.nmodels}] => {subdir}") + results = train_clustering(le, feats, classes, subdir) + + for key, value in results.items(): + if key not in measures: + measures[key] = [] + measures[key].append(results[key]) + + + # File with results + file_results = os.path.join(args.outdir, "clustering_measures.txt") + + with open(file_results, "w") as f: + f.write(f"[nmodels: {args.nmodels}]\n") + for key in measures.keys(): + values = np.asarray(measures[key], dtype=float) + mean = np.mean(values) + std = np.std(values) + f.write(f"[{key} => mean: {mean}, std: {std}] \n") + + # CSV File with all the values + file_csv_measures = os.path.join(args.outdir, "clustering_measures.csv") + + with open(file_csv_measures, "w", newline="") as f: + writer = csv.writer(f, delimiter=",") + writer.writerow(["measure"] + list(range(1, args.nmodels+1)) + ["mean"] + ["std"]) + for key in measures.keys(): + values = np.asarray(measures[key], dtype=float) + mean = np.mean(values) + std = np.std(values) + writer.writerow([key] + list(values) + [mean] + [std]) \ No newline at end of file -- 1.8.2.3