Quillot Mathias / volia

Browse Code »

Commit e403ed5fb6202dae56d47815d5961cced00f1c85

Authored by Mathias 2020-09-14 14:40:53 +0200

1 parent 11ee97e2cc

Exists in master

Add a script that allows the user to evaluate a representation using classification labels.

Showing 1 changed file with 126 additions and 0 deletions Inline Diff

scripts/evaluations/clustering.py

scripts/evaluations/clustering.py

Diff comments View file @ e403ed5

File was created	1	'''
	2	This script allows the user to evaluate a classification system on new labels using clustering methods.
	3	The algorithms are applied on the given latent space (embedding).
	4	'''
	5	import argparse
	6	import numpy as np
	7	import pandas as pd
	8	import os
	9	import time
	10	from sklearn.preprocessing import LabelEncoder
	11	from sklearn.metrics.pairwise import pairwise_distances
	12	from sklearn.metrics import f1_score
	13	from sklearn.cluster import KMeans
	14	from sklearn.manifold import TSNE
	15	import matplotlib.pyplot as plt
	16
	17	from volia.data_io import read_features,read_lst
	18
	19	if __name__ == "__main__":
	20	# Argparse
	21	parser = argparse.ArgumentParser("Compute clustering on a latent space")
	22	parser.add_argument("features")
	23	parser.add_argument("utt2",
	24	type=str,
	25	help="file with [utt] [value]")
	26	parser.add_argument("--prefix",
	27	type=str,
	28	help="prefix of saved files")
	29	parser.add_argument("--outdir",
	30	default=None,
	31	type=str,
	32	help="Output directory")
	33
	34	args = parser.parse_args()
	35
	36	assert args.outdir
	37
	38	start = time.time()
	39
	40	# Load features and utt2
	41	features = read_features(args.features)
	42	utt2 = read_lst(args.utt2)
	43
	44	ids = list(features.keys())
	45	feats = np.vstack([ features[id_] for id_ in ids ])
	46	classes = [ utt2[id_] for id_ in ids ]
	47
	48	# Encode labels
	49	le = LabelEncoder()
	50	labels = le.fit_transform(classes)
	51	num_classes = len(le.classes_)
	52
	53	# Compute KMEANS clustering on data
	54	estimator = KMeans(
	55	n_clusters=num_classes,
	56	n_init=100,
	57	tol=10-6,
	58	algorithm="elkan"
	59	)
	60	estimator.fit(feats)
	61	print(f"Kmeans: processed {estimator.n_iter_} iterations - intertia={estimator.inertia_}")
	62
	63	# contains distance to each cluster for each sample
	64	dist_space = estimator.transform(feats)
	65	predictions = np.argmin(dist_space, axis=1)
	66
	67	# gives each cluster a name (considering most represented character)
	68	dataframe = pd.DataFrame({
	69	"label": pd.Series(list(map(lambda x: le.classes_[x], labels))),
	70	"prediction": pd.Series(predictions)
	71	})
	72
	73	def find_cluster_name_fn(c):
	74	mask = dataframe["prediction"] == c
	75	return dataframe[mask]["label"].value_counts(sort=False).idxmax()
	76
	77	cluster_names = list(map(find_cluster_name_fn, range(num_classes)))
	78	predicted_labels = le.transform(
	79	[cluster_names[pred] for pred in predictions])
	80
	81	# F-measure
	82	fscores = f1_score(labels, predicted_labels, average=None)
	83	fscores_str = "\n".join(map(lambda i: "{0:25s}: {1:.4f}".format(le.classes_[i], fscores[i]), range(len(fscores))))
	84	print(f"F1-scores for each classes:\n{fscores_str}")
	85	print(f"Global score : {np.mean(fscores)}")
	86	with open(os.path.join(args.outdir, args.prefix + "eval_clustering.log"), "w") as fd:
	87	print(f"F1-scores for each classes:\n{fscores_str}", file=fd)
	88	print(f"Global score : {np.mean(fscores)}", file=fd)
	89
	90	# Process t-SNE and plot
	91	tsne_estimator = TSNE()
	92	embeddings = tsne_estimator.fit_transform(feats)
	93	print("t-SNE: processed {0} iterations - KL_divergence={1:.4f}".format(
	94	tsne_estimator.n_iter_, tsne_estimator.kl_divergence_))
	95
	96	fig, [axe1, axe2] = plt.subplots(1, 2, figsize=(10, 5))
	97	for c, name in enumerate(le.classes_):
	98	c_mask = np.where(labels == c)
	99	axe1.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)
	100
	101	try:
	102	id_cluster = cluster_names.index(name)
	103	except ValueError:
	104	print("WARNING: no cluster found for {}".format(name))
	105	continue
	106	c_mask = np.where(predictions == id_cluster)
	107	axe2.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)
	108
	109	axe1.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
	110	axe1.set_title("true labels")
	111	axe2.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
	112	axe2.set_title("predicted cluster label")
	113
	114	plt.suptitle("Kmeans Clustering")
	115
	116	loc = os.path.join(
	117	args.outdir,
	118	args.prefix + "kmeans.pdf"
	119	)
	120	plt.savefig(loc, bbox_inches="tight")
	121	plt.close()
	122
	123	print("INFO: figure saved at {}".format(loc))
	124
	125	end = time.time()
	126	print("program ended in {0:.2f} seconds".format(end-start))