Quillot Mathias / volia

Browse Code »

Commit e403ed5fb6202dae56d47815d5961cced00f1c85

Authored by Mathias 2020-09-14 14:40:53 +0200

1 parent 11ee97e2cc

Exists in master

Add a script that allows the user to evaluate a representation using classification labels.

Showing 1 changed file with 126 additions and 0 deletions Side-by-side Diff

scripts/evaluations/clustering.py

scripts/evaluations/clustering.py

Diff comments View file @ e403ed5

	1	+'''
	2	+This script allows the user to evaluate a classification system on new labels using clustering methods.
	3	+The algorithms are applied on the given latent space (embedding).
	4	+'''
	5	+import argparse
	6	+import numpy as np
	7	+import pandas as pd
	8	+import os
	9	+import time
	10	+from sklearn.preprocessing import LabelEncoder
	11	+from sklearn.metrics.pairwise import pairwise_distances
	12	+from sklearn.metrics import f1_score
	13	+from sklearn.cluster import KMeans
	14	+from sklearn.manifold import TSNE
	15	+import matplotlib.pyplot as plt
	16	+
	17	+from volia.data_io import read_features,read_lst
	18	+
	19	+if __name__ == "__main__":
	20	+ # Argparse
	21	+ parser = argparse.ArgumentParser("Compute clustering on a latent space")
	22	+ parser.add_argument("features")
	23	+ parser.add_argument("utt2",
	24	+ type=str,
	25	+ help="file with [utt] [value]")
	26	+ parser.add_argument("--prefix",
	27	+ type=str,
	28	+ help="prefix of saved files")
	29	+ parser.add_argument("--outdir",
	30	+ default=None,
	31	+ type=str,
	32	+ help="Output directory")
	33	+
	34	+ args = parser.parse_args()
	35	+
	36	+ assert args.outdir
	37	+
	38	+ start = time.time()
	39	+
	40	+ # Load features and utt2
	41	+ features = read_features(args.features)
	42	+ utt2 = read_lst(args.utt2)
	43	+
	44	+ ids = list(features.keys())
	45	+ feats = np.vstack([ features[id_] for id_ in ids ])
	46	+ classes = [ utt2[id_] for id_ in ids ]
	47	+
	48	+ # Encode labels
	49	+ le = LabelEncoder()
	50	+ labels = le.fit_transform(classes)
	51	+ num_classes = len(le.classes_)
	52	+
	53	+ # Compute KMEANS clustering on data
	54	+ estimator = KMeans(
	55	+ n_clusters=num_classes,
	56	+ n_init=100,
	57	+ tol=10-6,
	58	+ algorithm="elkan"
	59	+ )
	60	+ estimator.fit(feats)
	61	+ print(f"Kmeans: processed {estimator.n_iter_} iterations - intertia={estimator.inertia_}")
	62	+
	63	+ # contains distance to each cluster for each sample
	64	+ dist_space = estimator.transform(feats)
	65	+ predictions = np.argmin(dist_space, axis=1)
	66	+
	67	+ # gives each cluster a name (considering most represented character)
	68	+ dataframe = pd.DataFrame({
	69	+ "label": pd.Series(list(map(lambda x: le.classes_[x], labels))),
	70	+ "prediction": pd.Series(predictions)
	71	+ })
	72	+
	73	+ def find_cluster_name_fn(c):
	74	+ mask = dataframe["prediction"] == c
	75	+ return dataframe[mask]["label"].value_counts(sort=False).idxmax()
	76	+
	77	+ cluster_names = list(map(find_cluster_name_fn, range(num_classes)))
	78	+ predicted_labels = le.transform(
	79	+ [cluster_names[pred] for pred in predictions])
	80	+
	81	+ # F-measure
	82	+ fscores = f1_score(labels, predicted_labels, average=None)
	83	+ fscores_str = "\n".join(map(lambda i: "{0:25s}: {1:.4f}".format(le.classes_[i], fscores[i]), range(len(fscores))))
	84	+ print(f"F1-scores for each classes:\n{fscores_str}")
	85	+ print(f"Global score : {np.mean(fscores)}")
	86	+ with open(os.path.join(args.outdir, args.prefix + "eval_clustering.log"), "w") as fd:
	87	+ print(f"F1-scores for each classes:\n{fscores_str}", file=fd)
	88	+ print(f"Global score : {np.mean(fscores)}", file=fd)
	89	+
	90	+ # Process t-SNE and plot
	91	+ tsne_estimator = TSNE()
	92	+ embeddings = tsne_estimator.fit_transform(feats)
	93	+ print("t-SNE: processed {0} iterations - KL_divergence={1:.4f}".format(
	94	+ tsne_estimator.n_iter_, tsne_estimator.kl_divergence_))
	95	+
	96	+ fig, [axe1, axe2] = plt.subplots(1, 2, figsize=(10, 5))
	97	+ for c, name in enumerate(le.classes_):
	98	+ c_mask = np.where(labels == c)
	99	+ axe1.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)
	100	+
	101	+ try:
	102	+ id_cluster = cluster_names.index(name)
	103	+ except ValueError:
	104	+ print("WARNING: no cluster found for {}".format(name))
	105	+ continue
	106	+ c_mask = np.where(predictions == id_cluster)
	107	+ axe2.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)
	108	+
	109	+ axe1.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
	110	+ axe1.set_title("true labels")
	111	+ axe2.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
	112	+ axe2.set_title("predicted cluster label")
	113	+
	114	+ plt.suptitle("Kmeans Clustering")
	115	+
	116	+ loc = os.path.join(
	117	+ args.outdir,
	118	+ args.prefix + "kmeans.pdf"
	119	+ )
	120	+ plt.savefig(loc, bbox_inches="tight")
	121	+ plt.close()
	122	+
	123	+ print("INFO: figure saved at {}".format(loc))
	124	+
	125	+ end = time.time()
	126	+ print("program ended in {0:.2f} seconds".format(end-start))