Quillot Mathias / volia

Blame view

scripts/evaluations/clustering.py 4.89 KB

'''
This script allows the user to evaluate a classification system on new labels
using clustering methods.
The algorithms are applied on the given latent space (embedding).

Steps performed when run as a program:
  1. load the features and the "utt2" label list,
  2. fit a KMeans with as many clusters as there are distinct labels,
  3. name each cluster after its most represented label,
  4. print and save per-class F1 scores and their mean,
  5. project the features with t-SNE and save a two-panel scatter plot.
'''
import argparse
import numpy as np
import pandas as pd
import os
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import f1_score
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

from volia.data_io import read_features,read_lst

if __name__ == "__main__":
    # Argparse
    parser = argparse.ArgumentParser("Compute clustering on a latent space")
    parser.add_argument("features")
    parser.add_argument("utt2",
                        type=str,
                        help="file with [utt] [value]")
    parser.add_argument("--idsfrom",
                        type=str,
                        default="utt2",
                        choices=[
                            "features",
                            "utt2"
                        ],
                        help="from features or from utt2?")
    parser.add_argument("--prefix",
                        default="",
                        type=str,
                        help="prefix of saved files")
    parser.add_argument("--outdir",
                        default=None,
                        type=str,
                        help="Output directory")

    args = parser.parse_args()

    # Explicit check instead of `assert`: asserts are removed under
    # `python -O`, which would let a missing --outdir through.
    if not args.outdir:
        parser.error("--outdir is required")

    # Create the output directory now so that a missing directory is not
    # discovered only after the (long) clustering step has finished.
    os.makedirs(args.outdir, exist_ok=True)

    start = time.time()

    # Load features and utt2.
    # Both objects are used below as mappings (`.keys()` and `[id_]`);
    # presumably they are keyed by utterance id -- see volia.data_io.
    features = read_features(args.features)
    utt2 = read_lst(args.utt2)

    # Take id list
    if args.idsfrom == "features":
        ids = list(features.keys())
    elif args.idsfrom == "utt2":
        ids = list(utt2.keys())
    else:
        # Not reachable through argparse (`choices` rejects other values);
        # kept as a safety net.
        print(f"idsfrom is not good: {args.idsfrom}")
        raise SystemExit(1)

    # One row per id, in the same order as `classes`.
    feats = np.vstack([features[id_] for id_ in ids])
    classes = [utt2[id_] for id_ in ids]

    # Encode labels
    le = LabelEncoder()
    labels = le.fit_transform(classes)
    num_classes = len(le.classes_)

    # Compute KMEANS clustering on data
    estimator = KMeans(
        n_clusters=num_classes,
        n_init=100,
        # 1e-6, not `10-6`: the latter is integer subtraction (== 4) and
        # makes the convergence check pass far too early.
        tol=1e-6,
        algorithm="elkan"
    )
    estimator.fit(feats)
    print(f"Kmeans: processed {estimator.n_iter_} iterations - intertia={estimator.inertia_}")

    # contains distance to each cluster for each sample
    dist_space = estimator.transform(feats)
    # each sample is assigned to its closest cluster center
    predictions = np.argmin(dist_space, axis=1)

    # gives each cluster a name (considering most represented character)
    dataframe = pd.DataFrame({
        "label": pd.Series(le.classes_[labels]),
        "prediction": pd.Series(predictions)
    })

    def find_cluster_name_fn(c):
        """Return the label that occurs most often among samples of cluster `c`."""
        mask = dataframe["prediction"] == c
        return dataframe[mask]["label"].value_counts(sort=False).idxmax()

    # cluster_names[k] is the label given to cluster k; the same label may
    # be given to several clusters, and some labels may get no cluster.
    cluster_names = [find_cluster_name_fn(c) for c in range(num_classes)]
    predicted_labels = le.transform(
        [cluster_names[pred] for pred in predictions])

    # F-measure (one score per class, then their unweighted mean)
    fscores = f1_score(labels, predicted_labels, average=None)
    fscores_str = " ".join(
        "{0:25s}: {1:.4f}".format(name, score)
        for name, score in zip(le.classes_, fscores))
    global_score = np.mean(fscores)
    print(f"F1-scores for each classes: {fscores_str}")
    print(f"Global score : {global_score}")

    # Same report, written next to the figure.
    log_path = os.path.join(args.outdir, args.prefix + "eval_clustering.log")
    with open(log_path, "w") as fd:
        print(f"F1-scores for each classes: {fscores_str}", file=fd)
        print(f"Global score : {global_score}", file=fd)

    # Process t-SNE and plot
    tsne_estimator = TSNE()
    embeddings = tsne_estimator.fit_transform(feats)
    print("t-SNE: processed {0} iterations - KL_divergence={1:.4f}".format(
        tsne_estimator.n_iter_, tsne_estimator.kl_divergence_))

    # Left panel: points coloured by true label.
    # Right panel: points coloured by the label of their cluster.
    fig, (axe1, axe2) = plt.subplots(1, 2, figsize=(10, 5))
    for c, name in enumerate(le.classes_):
        c_mask = np.where(labels == c)
        axe1.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1],
                     label=name, alpha=0.2, edgecolors=None)

        try:
            # Only the first cluster carrying this label is drawn.
            id_cluster = cluster_names.index(name)
        except ValueError:
            print("WARNING: no cluster found for {}".format(name))
            continue
        c_mask = np.where(predictions == id_cluster)
        axe2.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1],
                     label=name, alpha=0.2, edgecolors=None)

    axe1.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
    axe1.set_title("true labels")
    axe2.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
    axe2.set_title("predicted cluster label")

    plt.suptitle("Kmeans Clustering")

    loc = os.path.join(
        args.outdir,
        args.prefix + "kmeans.pdf"
    )
    plt.savefig(loc, bbox_inches="tight")
    plt.close()

    print("INFO: figure saved at {}".format(loc))

    end = time.time()
    print("program ended in {0:.2f} seconds".format(end-start))