Blame view
scripts/evaluations/clustering.py
4.35 KB
e403ed5fb Add a script that... |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
"""
This script allows the user to evaluate a classification system on new labels
using clustering methods.

The algorithms are applied on the given latent space (embedding): K-means is
fitted with one cluster per distinct label, every cluster is named after the
label that is most represented among its members, and per-class F1-scores are
computed between the true labels and the cluster names. A t-SNE projection of
the features is finally plotted twice, coloured by true label and by predicted
cluster label.

Outputs written in ``--outdir``:
    * ``<prefix>eval_clustering.log``: the F1-scores
    * ``<prefix>kmeans.pdf``: the t-SNE figure
"""
import argparse
import numpy as np
import pandas as pd
import os
import time
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import f1_score
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

from volia.data_io import read_features,read_lst


def _parse_args():
    """Parse and validate the command line.

    Exits with a usage error (through ``parser.error``) when ``--outdir`` is
    missing or empty.
    """
    # The text is a description, not a program name: passing it positionally
    # would set ``prog`` and garble the usage line.
    parser = argparse.ArgumentParser(
        description="Compute clustering on a latent space")
    parser.add_argument("features")
    parser.add_argument("utt2",
                        type=str,
                        help="file with [utt] [value]")
    # Default to an empty prefix so that omitting the option does not end in
    # ``None + "..."`` once all the computation is done.
    parser.add_argument("--prefix",
                        default="",
                        type=str,
                        help="prefix of saved files")
    parser.add_argument("--outdir",
                        default=None,
                        type=str,
                        help="Output directory")

    args = parser.parse_args()
    # Explicit check rather than ``assert``: asserts vanish under ``python -O``.
    if not args.outdir:
        parser.error("--outdir is required")
    return args


def _name_clusters(true_names, predictions, num_clusters):
    """Give each cluster the name of its most represented true label.

    Args:
        true_names: true label name of every sample.
        predictions: cluster index assigned to every sample.
        num_clusters: number of clusters to name (indices 0..num_clusters-1).

    Returns:
        A list where item ``c`` is the name chosen for cluster ``c``. Several
        clusters may end up with the same name.
    """
    dataframe = pd.DataFrame({
        "label": pd.Series(true_names),
        "prediction": pd.Series(predictions)
    })

    def find_cluster_name_fn(c):
        mask = dataframe["prediction"] == c
        return dataframe[mask]["label"].value_counts(sort=False).idxmax()

    return [find_cluster_name_fn(c) for c in range(num_clusters)]


def _plot_tsne(embeddings, labels, predicted_labels, class_names, loc):
    """Save side by side scatter plots of true and predicted labels.

    Args:
        embeddings: 2-column t-SNE projection, one row per sample.
        labels: encoded true label of every sample.
        predicted_labels: encoded name of the cluster of every sample.
        class_names: label names, indexed by encoded value.
        loc: path of the figure to write.
    """
    fig, [axe1, axe2] = plt.subplots(1, 2, figsize=(10, 5))
    for c, name in enumerate(class_names):
        true_mask = labels == c
        axe1.scatter(embeddings[true_mask][:, 0],
                     embeddings[true_mask][:, 1],
                     label=name,
                     alpha=0.2,
                     edgecolors=None)

        # Select on the predicted label instead of on a single cluster index:
        # several clusters can carry the same name and all of them must be
        # drawn, otherwise their samples silently disappear from the figure.
        pred_mask = predicted_labels == c
        if not pred_mask.any():
            print("WARNING: no cluster found for {}".format(name))
            continue
        axe2.scatter(embeddings[pred_mask][:, 0],
                     embeddings[pred_mask][:, 1],
                     label=name,
                     alpha=0.2,
                     edgecolors=None)

    axe1.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
    axe1.set_title("true labels")
    axe2.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
    axe2.set_title("predicted cluster label")

    plt.suptitle("Kmeans Clustering")

    plt.savefig(loc, bbox_inches="tight")
    plt.close()


def main():
    """Run the whole evaluation: clustering, F1-scores, t-SNE figure."""
    args = _parse_args()

    # Create the output directory up front so that a bad path is reported
    # before the (long) clustering, not when the first file is written.
    os.makedirs(args.outdir, exist_ok=True)

    start = time.time()

    # Load features and utt2
    # NOTE(review): both readers come from volia.data_io; they are used here
    # as mappings indexed by utterance id -- confirm against their definition.
    features = read_features(args.features)
    utt2 = read_lst(args.utt2)

    ids = list(features.keys())
    feats = np.vstack([features[id_] for id_ in ids])
    classes = [utt2[id_] for id_ in ids]

    # Encode labels
    le = LabelEncoder()
    labels = le.fit_transform(classes)
    num_classes = len(le.classes_)

    # Compute KMEANS clustering on data
    estimator = KMeans(
        n_clusters=num_classes,
        n_init=100,
        # Was written ``10-6``, i.e. the integer 4, which made the
        # convergence test succeed almost immediately.
        tol=1e-6,
        algorithm="elkan"
    )
    estimator.fit(feats)
    print(f"Kmeans: processed {estimator.n_iter_} iterations - intertia={estimator.inertia_}")

    # contains distance to each cluster for each sample
    dist_space = estimator.transform(feats)
    predictions = np.argmin(dist_space, axis=1)

    # gives each cluster a name (considering most represented character)
    cluster_names = _name_clusters(le.classes_[labels], predictions,
                                   num_classes)
    predicted_labels = le.transform(
        [cluster_names[pred] for pred in predictions])

    # F-measure
    fscores = f1_score(labels, predicted_labels, average=None)
    fscores_str = " ".join(
        "{0:25s}: {1:.4f}".format(name, score)
        for name, score in zip(le.classes_, fscores))
    # Same two lines go to the console and to the log file.
    report = [
        f"F1-scores for each classes: {fscores_str}",
        f"Global score : {np.mean(fscores)}",
    ]
    for line in report:
        print(line)
    log_path = os.path.join(args.outdir, args.prefix + "eval_clustering.log")
    with open(log_path, "w") as fd:
        for line in report:
            print(line, file=fd)

    # Process t-SNE and plot
    tsne_estimator = TSNE()
    embeddings = tsne_estimator.fit_transform(feats)
    print("t-SNE: processed {0} iterations - KL_divergence={1:.4f}".format(
        tsne_estimator.n_iter_, tsne_estimator.kl_divergence_))

    loc = os.path.join(
        args.outdir,
        args.prefix + "kmeans.pdf"
    )
    _plot_tsne(embeddings, labels, predicted_labels, le.classes_, loc)
    print("INFO: figure saved at {}".format(loc))

    end = time.time()
    print("program ended in {0:.2f} seconds".format(end-start))


if __name__ == "__main__":
    main()