Commit e7d811503f88129fb1d8eb28dd6af09f681a771e

Authored by Quillot Mathias
1 parent 15e16ec460
Exists in master

New file architecture. Scripts now live in the volia directory and the library lives in the core directory.

Showing 14 changed files with 419 additions and 670 deletions
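A minimal sketch of how imports change with this commit. The volia.core.measures path is visible in this diff; volia.core.data_io is assumed from the commit message (the library moved to the core directory):

# before this commit:
#   from volia.data_io import read_lst, read_features
#   from volia.measures import entropy_score, purity_score

# after this commit, library code lives under volia/core:
from volia.core.data_io import read_lst, read_features
from volia.core.measures import entropy_score, purity_score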

scripts/data-management/convert-old.py (file was deleted)

import argparse
from os.path import isfile


if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        description="Convert old files with wrong id to new one. Masseffect.")

    parser.add_argument("file", type=str, help="feature, x2x, or list file")
    parser.add_argument("--outfile", type=str, default="out.txt", help="output file")

    args = parser.parse_args()

    assert isfile(args.file), "The given file does not exist."

    with open(args.file, "r") as f, open(args.outfile, "w") as of:
        for line in f:
            splited = line.replace("\n", "").split(" ")
            metas = splited[0].split(",")
            metas.pop(2)
            splited[0] = ",".join(metas)
            of.write(" ".join(splited) + "\n")
scripts/data-management/filter_ids.py (file was deleted)

import argparse
from os.path import isfile
from volia.data_io import read_lst

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Filter ids of the given file to only keep a subset")
    parser.add_argument("file", type=str, help="")
    parser.add_argument("--filter", default=None, type=str, help="")
    parser.add_argument("--outfile", default="out.txt", type=str, help="")

    args = parser.parse_args()

    assert args.filter is not None
    assert isfile(args.file)

    list_ = read_lst(args.file)
    filter_ = read_lst(args.filter)

    with open(args.outfile, "w") as of:
        for key in filter_.keys():
            of.write(key + " " + " ".join(list_[key]) + "\n")

    print("File filtered and written in: ", args.outfile)
scripts/dim-reduction/tsne.py (file was deleted)

'''
The goal of this script is to calculate the t-SNE of p-vectors.
'''

import os
from os.path import isfile
import argparse
import numpy as np
from sklearn.manifold import TSNE

from volia.data_io import read_features

if __name__ == "__main__":
    # Defining argparse
    parser = argparse.ArgumentParser(prog='pvector tsne', description='Calculate the tsne representation of pvector in 3 or 2d')
    parser.add_argument('features', type=str,
                        help='the path of the file you want to calculate tsne')
    parser.add_argument('-o', '--outfile', type=str,
                        default='out.txt',
                        help='the path of the output file.')
    parser.add_argument('-n', '--n-comp', type=int, choices=[2, 3],
                        default=2,
                        help='number of components output of tsne')

    args = parser.parse_args()

    assert isfile(args.features)

    features_list = read_features(args.features)
    keys = list(features_list.keys())
    features = np.vstack([features_list[key] for key in keys])
    feat_tsne = TSNE(n_components=args.n_comp).fit_transform(features)

    with open(args.outfile, "w") as of:
        for i in range(len(keys)):
            of.write(keys[i] + " " + " ".join([str(feat) for feat in feat_tsne[i]]) + "\n")
    print("TSNE finished. Check if everything has been done well.")
scripts/evaluations/clustering.py (file was deleted)

'''
This script allows the user to evaluate a classification system on new labels using clustering methods.
The algorithms are applied on the given latent space (embedding).
'''
import argparse
import numpy as np
import pandas as pd
import os
import time
import pickle
import csv
import json

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import f1_score, homogeneity_score, completeness_score, v_measure_score
import matplotlib.pyplot as plt

from volia.data_io import read_features, read_lst
from volia.measures import entropy_score, purity_score

'''
TODO:
- Add an option allowing the user to choose the number of
clustering models to train in order to compute the average and the
standard deviation of the measures.
'''


def train_clustering(label_encoder, feats, classes, outdir):
    num_classes = len(label_encoder.classes_)
    labels = label_encoder.transform(classes)
    estimator = None
    kmeans_filepath = os.path.join(outdir, f"{args.prefix}kmeans.pkl")
    if args.onlymeasures:
        print(f"Loading model: {kmeans_filepath}")
        with open(kmeans_filepath, "rb") as f:
            estimator = pickle.load(f)
    else:
        # Compute KMEANS clustering on data
        print("Saving parameters")
        kmeans_parameters = {
            "n_clusters": num_classes,
            "n_init": 100,
            "tol": 1e-6,
            "algorithm": "elkan"
        }
        with open(os.path.join(outdir, f"{args.prefix}kmeans_parameters.json"), "w") as f:
            json.dump(kmeans_parameters, f)

        # Fit the model and save it
        print(f"Fit the model: {kmeans_filepath}")
        estimator = KMeans(
            **kmeans_parameters
        )
        estimator.fit(feats)
        print(f"Kmeans: processed {estimator.n_iter_} iterations - inertia={estimator.inertia_}")

        with open(kmeans_filepath, "wb") as f:
            pickle.dump(estimator, f)

    # contains distance to each cluster for each sample
    dist_space = estimator.transform(feats)
    predictions = np.argmin(dist_space, axis=1)

    # gives each cluster a name (considering most represented character)
    dataframe = pd.DataFrame({
        "label": pd.Series(list(map(lambda x: label_encoder.classes_[x], labels))),
        "prediction": pd.Series(predictions)
    })

    def find_cluster_name_fn(c):
        mask = dataframe["prediction"] == c
        return dataframe[mask]["label"].value_counts(sort=False).idxmax()

    cluster_names = list(map(find_cluster_name_fn, range(num_classes)))
    predicted_labels = label_encoder.transform(
        [cluster_names[pred] for pred in predictions])

    # F-measure
    fscores = f1_score(labels, predicted_labels, average=None)
    fscores_str = "\n".join(map(lambda i: "{0:25s}: {1:.4f}".format(label_encoder.classes_[i], fscores[i]), range(len(fscores))))

    # Entropy
    _, _, entropy = entropy_score(labels, predicted_labels)

    # Homogeneity
    homogeneity = homogeneity_score(labels, predicted_labels)

    # Completeness
    completeness = completeness_score(labels, predicted_labels)

    # V-Measure
    v_measure = v_measure_score(labels, predicted_labels)

    # Purity
    purity_scores = purity_score(labels, predicted_labels)
    purity_class_score = purity_scores["purity_class_score"]
    purity_cluster_score = purity_scores["purity_cluster_score"]
    K = purity_scores["K"]

    # Write results
    with open(os.path.join(outdir, args.prefix + "eval_clustering.log"), "w") as fd:
        print(f"F1-scores for each class:\n{fscores_str}", file=fd)
        print(f"Entropy: {entropy}", file=fd)
        print(f"Global score: {np.mean(fscores)}", file=fd)
        print(f"Homogeneity: {homogeneity}", file=fd)
        print(f"Completeness: {completeness}", file=fd)
        print(f"V-measure: {v_measure}", file=fd)
        print(f"Purity class score: {purity_class_score}", file=fd)
        print(f"Purity cluster score: {purity_cluster_score}", file=fd)
        print(f"Purity overall evaluation criterion (K): {K}", file=fd)

    # Process t-SNE and plot
    tsne_estimator = TSNE()
    embeddings = tsne_estimator.fit_transform(feats)
    print("t-SNE: processed {0} iterations - KL_divergence={1:.4f}".format(
        tsne_estimator.n_iter_, tsne_estimator.kl_divergence_))

    fig, [axe1, axe2] = plt.subplots(1, 2, figsize=(10, 5))
    for c, name in enumerate(label_encoder.classes_):
        c_mask = np.where(labels == c)
        axe1.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)

        try:
            id_cluster = cluster_names.index(name)
        except ValueError:
            print("WARNING: no cluster found for {}".format(name))
            continue
        c_mask = np.where(predictions == id_cluster)
        axe2.scatter(embeddings[c_mask][:, 0], embeddings[c_mask][:, 1], label=name, alpha=0.2, edgecolors=None)

    axe1.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
    axe1.set_title("true labels")
    axe2.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
    axe2.set_title("predicted cluster label")

    plt.suptitle("Kmeans Clustering")

    loc = os.path.join(
        outdir,
        args.prefix + "kmeans.pdf"
    )
    plt.savefig(loc, bbox_inches="tight")
    plt.close()

    print("INFO: figure saved at {}".format(loc))

    end = time.time()
    print("program ended in {0:.2f} seconds".format(end - start))
    return {
        "f1": np.mean(fscores),
        "entropy": entropy,
        "homogeneity": homogeneity,
        "completeness": completeness,
        "v-measure": v_measure,
        "purity_class_score": purity_class_score,
        "purity_cluster_score": purity_cluster_score,
        "K": K
    }


if __name__ == "__main__":
    # Argparse
    parser = argparse.ArgumentParser("Compute clustering on a latent space")
    parser.add_argument("features")
    parser.add_argument("utt2",
                        type=str,
                        help="file with [utt] [value]")
    parser.add_argument("--idsfrom",
                        type=str,
                        default="utt2",
                        choices=[
                            "features",
                            "utt2"
                        ],
                        help="from features or from utt2?")
    parser.add_argument("--prefix",
                        default="",
                        type=str,
                        help="prefix of saved files")
    parser.add_argument("--outdir",
                        default=None,
                        type=str,
                        help="Output directory")
    parser.add_argument("--nmodels",
                        type=int,
                        default=1,
                        help="specifies the number of models to train")
    parser.add_argument("--onlymeasures",
                        action='store_true',
                        help="Don't compute the clustering, compute only the measures")
    args = parser.parse_args()

    assert args.outdir

    start = time.time()

    # Load features and utt2
    features = read_features(args.features)
    utt2 = read_lst(args.utt2)

    # Take id list
    if args.idsfrom == "features":
        ids = list(features.keys())
    elif args.idsfrom == "utt2":
        ids = list(utt2.keys())
    else:
        print(f"Unknown value for --idsfrom: {args.idsfrom}")
        exit(1)

    feats = np.vstack([features[id_] for id_ in ids])
    classes = [utt2[id_] for id_ in ids]

    # Encode labels
    le = LabelEncoder()
    labels = le.fit_transform(classes)

    measures = {}
    for i in range(1, args.nmodels + 1):
        subdir = os.path.join(args.outdir, str(i))
        if not os.path.exists(subdir):
            os.mkdir(subdir)
        print(f"[{i}/{args.nmodels}] => {subdir}")
        results = train_clustering(le, feats, classes, subdir)

        for key, value in results.items():
            if key not in measures:
                measures[key] = []
            measures[key].append(value)

    # File with results
    file_results = os.path.join(args.outdir, args.prefix + "clustering_measures.txt")

    with open(file_results, "w") as f:
        f.write(f"[nmodels: {args.nmodels}]\n")
        for key in measures.keys():
            values = np.asarray(measures[key], dtype=float)
            mean = np.mean(values)
            std = np.std(values)
            f.write(f"[{key} => mean: {mean}, std: {std}]\n")

    # CSV file with all the values
    file_csv_measures = os.path.join(args.outdir, args.prefix + "clustering_measures.csv")

    with open(file_csv_measures, "w", newline="") as f:
        writer = csv.writer(f, delimiter=",")
        writer.writerow(["measure"] + list(range(1, args.nmodels + 1)) + ["mean"] + ["std"])
        for key in measures.keys():
            values = np.asarray(measures[key], dtype=float)
            mean = np.mean(values)
            std = np.std(values)
            writer.writerow([key] + list(values) + [mean] + [std])
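For reference, a sketch of the two inputs the script expects, with hypothetical utterance ids and labels (the format is the one parsed by read_lst/read_features):

# A tiny, hypothetical input pair for clustering.py: "feats.txt" holds one
# embedding per utterance, "utt2char.txt" maps each utterance to a label.
with open("feats.txt", "w") as f:
    f.write("utt001 0.12 -0.48 1.03\n")
    f.write("utt002 0.77 0.05 -0.31\n")
with open("utt2char.txt", "w") as f:
    f.write("utt001 shepard\n")
    f.write("utt002 garrus\n")
# A run could then look like (5 KMeans models, results under results/):
#   python scripts/evaluations/clustering.py feats.txt utt2char.txt --outdir results --nmodels 5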
scripts/plot/plot-character.py (file was deleted)

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import argparse
from os.path import isfile
from volia.data_io import read_features, read_lst


if __name__ == "__main__":
    # Argparse
    parser = argparse.ArgumentParser(description="Plot points with color for each character")
    parser.add_argument("--features", type=str, help="features file path")
    parser.add_argument("--utt2char", type=str, help="utt2char file path")
    parser.add_argument("--sublist", type=str, default=None, help="white list of ids to take into account")
    parser.add_argument("--outfile", default="out.pdf", type=str, help="")
    parser.add_argument("--title", default="Example of plot", type=str, help="Specify the title")
    args = parser.parse_args()

    # List of assertions
    assert args.features, "Need to specify the features option"
    assert args.utt2char, "Need to specify the utt2char option"
    assert isfile(args.features), "Features path should point to a file"
    assert isfile(args.utt2char), "utt2char path should point to a file"
    if args.sublist is not None:
        assert isfile(args.sublist), "sublist path should point to a file"

    id_to_features = read_features(args.features)

    ids = []
    if args.sublist is not None:
        print("Using sublist")
        list_ids = read_lst(args.sublist)
        ids = [key for key in list_ids.keys()]
    else:
        ids = [key for key in id_to_features.keys()]

    utt2char = read_lst(args.utt2char)

    features = [id_to_features[id_] for id_ in ids]
    features = np.vstack(features)

    characters_list = [utt2char[id_][0] for id_ in ids]

    features_T = features.transpose()
    print("Number of characters: ", len(np.unique(characters_list)))
    df = pd.DataFrame(dict(
        x=features_T[0],
        y=features_T[1],
        character=characters_list))

    groups = df.groupby('character')

    # Plot
    fig, ax = plt.subplots()

    for character, group in groups:
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=1, label=character)
    ax.legend()
    ax.set_title(args.title)
    plt.savefig(args.outfile)
    print("Plot saved to:", args.outfile)
volia/convert-old.py (file was created)

import argparse
from os.path import isfile


if __name__ == "__main__":

    parser = argparse.ArgumentParser(
        description="Convert old files with wrong id to new one. Masseffect.")

    parser.add_argument("file", type=str, help="feature, x2x, or list file")
    parser.add_argument("--outfile", type=str, default="out.txt", help="output file")

    args = parser.parse_args()

    assert isfile(args.file), "The given file does not exist."

    with open(args.file, "r") as f, open(args.outfile, "w") as of:
        for line in f:
            splited = line.replace("\n", "").split(" ")
            metas = splited[0].split(",")
            metas.pop(2)
            splited[0] = ",".join(metas)
            of.write(" ".join(splited) + "\n")
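A worked example with a made-up line (the real id scheme is not shown in this commit). The script removes the third comma-separated field of the leading token and leaves the rest of the line untouched:

# Hypothetical input line; 'OLD' stands for the wrong id field.
line = "spk1,ep2,OLD,utt42 0.1 0.2 0.3"
splited = line.split(" ")
metas = splited[0].split(",")   # ['spk1', 'ep2', 'OLD', 'utt42']
metas.pop(2)                    # drops 'OLD'
splited[0] = ",".join(metas)
print(" ".join(splited))        # spk1,ep2,utt42 0.1 0.2 0.3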
volia/core/data_io.py (file was created)

'''
Data management input/output
'''

# Import packages and modules
import numpy as np

# Defining some types
from typing import List, Dict
KeyToList = Dict[str, List[str]]
KeyToFeatures = Dict[str, List[float]]


def read_lst(file_path: str) -> KeyToList:
    '''
    Read lst file with this structure:
    [id] [value1] [value2] ... [value n]

    This is a basic function reused by others like read_features.
    Returns a dictionary with the id as key and a list of values as corresponding value.
    '''
    # KeyToList type variable
    key_to_list = dict()
    with open(file_path, "r") as f:
        for line in f:
            splited = line.replace("\n", "").split(" ")
            id = splited[0]
            values = splited[1:]
            key_to_list[id] = values
    return key_to_list


def read_features(file_path: str) -> KeyToFeatures:
    '''
    Read a features file in the same [id] [value1] ... [value n] format
    and return a dictionary mapping each id to a numpy array of floats.
    '''
    # KeyToFeatures type variable
    key_to_features = dict()
    # and the KeyToList
    key_to_list = read_lst(file_path)

    for key_, list_ in key_to_list.items():
        key_to_features[key_] = np.asarray(list_, dtype=float)

    return key_to_features
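A minimal round trip, assuming the package layout of this commit (volia/core/data_io.py); the file format is the one described in read_lst:

from volia.core.data_io import read_lst, read_features

# Write a tiny hypothetical features file, then read it back both ways.
with open("feats.txt", "w") as f:
    f.write("utt001 0.5 1.5\n")
    f.write("utt002 -1.0 2.0\n")

print(read_lst("feats.txt"))       # {'utt001': ['0.5', '1.5'], 'utt002': ['-1.0', '2.0']}
print(read_features("feats.txt"))  # {'utt001': array([0.5, 1.5]), 'utt002': array([-1., 2.])}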
volia/core/measures.py (file was created)

'''
This module is a part of my library.
It aims to compute some measures for clustering.
'''

import numpy as np


def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
    '''
    Compute the disequilibrium for all the clusters.
    The disequilibrium is computed from the difference
    between two clustering sets.
    isGlobal lets the user choose the denominator of the function:
    - True: divide the values by the total number of elements
    - False: divide the values by the number of elements of each cluster

    mod lets the user choose which transformations to apply to the
    difference: "power" (square), "human" (multiply by 100) or
    "abs" (absolute value). Several can be combined, separated by spaces.
    '''

    def divide_line(a, divider):
        '''
        Sub-function used for dividing a matrix by a vector, line by line.
        '''
        return np.divide(a, divider, out=np.zeros_like(a), where=divider != 0)

    dividers1 = 0
    dividers2 = 0

    if isGlobal:
        dividers1 = matrix1.sum()
        dividers2 = matrix2.sum()
    else:
        dividers1 = matrix1.sum(axis=1)
        dividers2 = matrix2.sum(axis=1)

    matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=float), dividers1)

    matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=float), dividers2)

    diff = matrix1_divided - matrix2_divided

    mask = np.logical_not(np.logical_and(matrix2 == 0, matrix1 == 0))

    result = diff

    if mod:
        for word in mod.split(" "):
            if word == "power":
                result = np.power(result, 2)
            elif word == "human":
                result = result * 100
            elif word == "abs":
                result = np.absolute(result)
            else:
                raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\")")
    return (mask, result)


def disequilibrium_mean_by_cluster(mask, matrix):
    '''
    Mean of the disequilibrium.
    matrix is the disequilibrium calculated
    from the number of occurrences belonging to a class,
    for each cluster.
    '''
    nb_k = len(matrix)
    results = np.zeros((nb_k))

    for i in range(nb_k):
        results[i] = matrix[i].sum() / mask[i].sum()
    return results


def disequilibrium(matrix1, matrix2, isGlobal=False):
    '''
    Return the disequilibrium mask, the disequilibrium matrix
    (as percentages) and the global disequilibrium value.
    '''
    mask, result = disequilibrium_(matrix1, matrix2, isGlobal)
    result_human = result * 100
    result_power = np.power(result, 2)

    return (
        mask,
        result_human,
        disequilibrium_mean_by_cluster(mask, result_power).sum() / matrix1.shape[0]
    )


def compute_count_matrix(y_truth, y_hat):
    '''
    Build the (cluster x class) count matrix from the two label vectors.
    Both vectors must have the same length (checked with an assertion).
    '''
    # Check the size of the lists
    assert len(y_hat) == len(y_truth), f"Label vectors should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}"

    # Build count matrix
    count_matrix = np.zeros((max(y_hat + 1), max(y_truth + 1)))
    for i in range(len(y_hat)):
        count_matrix[y_hat[i]][y_truth[i]] += 1
    return count_matrix


def entropy_score(y_truth, y_hat):
    '''
    y_hat and y_truth need to be label encoded before calling this function.
    Don't use one-hot labels.

    Return a tuple with:
    - result_matrix: the matrix with the log-multiplied probabilities (P(x) * log(P(x)))
    - result_vector: the vector with the entropy summed over each cluster. Each value corresponds to a cluster.
    - result: the final entropy measure of the clustering
    '''
    def divide_line(a, divider):
        '''
        Sub-function used for dividing a matrix by a vector, line by line.
        '''
        return np.divide(a, divider, out=np.zeros_like(a), where=divider != 0)

    # Build count matrix
    count_matrix = compute_count_matrix(y_truth, y_hat)

    # Build dividers vector
    dividers = count_matrix.sum(axis=1)

    matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=float), dividers)

    log_matrix = np.zeros(matrix_divided.shape)
    np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)
    result_matrix = -1 * np.multiply(matrix_divided, log_matrix)
    result_vector = result_matrix.sum(axis=1)

    if np.isnan(np.sum(result_vector)):
        print("COUNT MATRIX")
        print(count_matrix)
        print("MATRIX DIVIDED")
        print(matrix_divided)
        print("RESULT MATRIX")
        print(result_matrix)
        print("VECTOR MATRIX")
        print(result_vector)
        print("An error occurred due to a nan value, some values are printed above")
        exit(1)

    result = result_vector * dividers / dividers.sum()
    result = result.sum()
    return (result_matrix, result_vector, result)


def purity_score(y_truth, y_hat):
    '''
    Return three values in a dictionary:
    - purity_class_score: the purity score of the classes (asp)
    - purity_cluster_score: the purity score of the clusters (acp)
    - K: the overall evaluation criterion (sqrt(asp * acp))

    This function is based on the following article:
    Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan
    '''

    def divide_line(a, divider):
        '''
        Sub-function used for dividing a matrix by a vector, line by line.
        '''
        return np.divide(a, divider, out=np.zeros_like(a), where=divider != 0)

    def compute_purity_score(count_matrix, axis=0):
        if axis == 0:
            other_axis = 1
        else:
            other_axis = 0
        count_per_row = count_matrix.sum(axis=axis)
        dividers = np.square(count_per_row)

        count_matrix_squared = np.square(count_matrix)
        matrix_divided = np.apply_along_axis(divide_line, other_axis, np.asarray(count_matrix_squared, dtype=float), dividers)
        vector_purity = np.sum(matrix_divided, axis=axis)

        scalar_purity = np.average(vector_purity, weights=count_per_row)
        return (vector_purity, scalar_purity)

    count_matrix = compute_count_matrix(y_truth, y_hat)
    _, purity_cluster_score = compute_purity_score(count_matrix, 1)
    _, purity_class_score = compute_purity_score(count_matrix, 0)

    K = np.sqrt(purity_cluster_score * purity_class_score)

    return {
        "purity_class_score": purity_class_score,
        "purity_cluster_score": purity_cluster_score,
        "K": K
    }


if __name__ == "__main__":
    print("Purity test #1")
    # Hypothesis
    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
    # Truth
    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])

    (result_matrix, result_vector, result) = entropy_score(y, y_hat)
    print(purity_score(y, y_hat))

    print("Purity test #2")
    # Hypothesis
    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0, 4, 4, 4])
    # Truth
    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 0, 3, 3, 3])

    (result_matrix, result_vector, result) = entropy_score(y, y_hat)
    print("Result matrix: ")
    print(result_matrix)
    print("Result vector: ")
    print(result_vector)
    print("Result: ", result)
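For reference, writing n_kc for the number of samples placed in cluster k whose true class is c, n_k and n_c for the corresponding row and column sums, and N for the total number of samples, the entropy and purity measures implemented above amount to:

% Entropy: weighted average of the per-cluster entropies
H = \sum_{k} \frac{n_k}{N} \Big( -\sum_{c} \frac{n_{kc}}{n_k} \log_2 \frac{n_{kc}}{n_k} \Big)

% Average cluster purity (acp), average speaker purity (asp),
% and the overall criterion K
acp = \frac{1}{N} \sum_{k} \frac{1}{n_k} \sum_{c} n_{kc}^2
\qquad
asp = \frac{1}{N} \sum_{c} \frac{1}{n_c} \sum_{k} n_{kc}^2
\qquad
K = \sqrt{acp \cdot asp}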
volia/data_io.py (file was deleted)

'''
Data management input/output
'''

# Import packages and modules
import numpy as np

# Defining some types
from typing import List, Dict
KeyToList = Dict[str, List[str]]
KeyToFeatures = Dict[str, List[float]]


def read_lst(file_path: str) -> KeyToList:
    '''
    Read lst file with this structure:
    [id] [value1] [value2] ... [value n]

    This is a basic function reused by others like read_features.
    Returns a dictionary with the id as key and a list of values as corresponding value.
    '''
    # KeyToList type variable
    key_to_list = dict()
    with open(file_path, "r") as f:
        for line in f:
            splited = line.replace("\n", "").split(" ")
            id = splited[0]
            values = splited[1:]
            key_to_list[id] = values
    return key_to_list


def read_features(file_path: str) -> KeyToFeatures:
    '''
    Read a features file in the same [id] [value1] ... [value n] format
    and return a dictionary mapping each id to a numpy array of floats.
    '''
    # KeyToFeatures type variable
    key_to_features = dict()
    # and the KeyToList
    key_to_list = read_lst(file_path)

    for key_, list_ in key_to_list.items():
        key_to_features[key_] = np.asarray(list_, dtype=float)

    return key_to_features
volia/filter_ids.py (file was created)

import argparse
from os.path import isfile
# the library now lives under volia/core
from volia.core.data_io import read_lst

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Filter ids of the given file to only keep a subset")
    parser.add_argument("file", type=str, help="")
    parser.add_argument("--filter", default=None, type=str, help="")
    parser.add_argument("--outfile", default="out.txt", type=str, help="")

    args = parser.parse_args()

    assert args.filter is not None
    assert isfile(args.file)

    list_ = read_lst(args.file)
    filter_ = read_lst(args.filter)

    with open(args.outfile, "w") as of:
        for key in filter_.keys():
            of.write(key + " " + " ".join(list_[key]) + "\n")

    print("File filtered and written in: ", args.outfile)
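A hypothetical example of keeping only the ids listed in the filter file:

# Build two small input files, then run the script on them.
with open("full.lst", "w") as f:
    f.write("utt001 a b\nutt002 c d\nutt003 e f\n")
with open("subset.lst", "w") as f:
    f.write("utt001 x\nutt003 y\n")
# python volia/filter_ids.py full.lst --filter subset.lst --outfile out.txt
# out.txt then contains the "utt001" and "utt003" lines of full.lst.
# Note: every id in the filter file must also exist in the main file,
# otherwise list_[key] raises a KeyError.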
volia/measures.py (file was deleted)

'''
This module is a part of my library.
It aims to compute some measures for clustering.
'''

import numpy as np


def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
    '''
    Compute the disequilibrium for all the clusters.
    The disequilibrium is computed from the difference
    between two clustering sets.
    isGlobal lets the user choose the denominator of the function:
    - True: divide the values by the total number of elements
    - False: divide the values by the number of elements of each cluster

    mod lets the user choose which transformations to apply to the
    difference: "power" (square), "human" (multiply by 100) or
    "abs" (absolute value). Several can be combined, separated by spaces.
    '''

    def divide_line(a, divider):
        '''
        Sub-function used for dividing a matrix by a vector, line by line.
        '''
        return np.divide(a, divider, out=np.zeros_like(a), where=divider != 0)

    dividers1 = 0
    dividers2 = 0

    if isGlobal:
        dividers1 = matrix1.sum()
        dividers2 = matrix2.sum()
    else:
        dividers1 = matrix1.sum(axis=1)
        dividers2 = matrix2.sum(axis=1)

    matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=float), dividers1)

    matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=float), dividers2)

    diff = matrix1_divided - matrix2_divided

    mask = np.logical_not(np.logical_and(matrix2 == 0, matrix1 == 0))

    result = diff

    if mod:
        for word in mod.split(" "):
            if word == "power":
                result = np.power(result, 2)
            elif word == "human":
                result = result * 100
            elif word == "abs":
                result = np.absolute(result)
            else:
                raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\")")
    return (mask, result)


def disequilibrium_mean_by_cluster(mask, matrix):
    '''
    Mean of the disequilibrium.
    matrix is the disequilibrium calculated
    from the number of occurrences belonging to a class,
    for each cluster.
    '''
    nb_k = len(matrix)
    results = np.zeros((nb_k))

    for i in range(nb_k):
        results[i] = matrix[i].sum() / mask[i].sum()
    return results


def disequilibrium(matrix1, matrix2, isGlobal=False):
    '''
    Return the disequilibrium mask, the disequilibrium matrix
    (as percentages) and the global disequilibrium value.
    '''
    mask, result = disequilibrium_(matrix1, matrix2, isGlobal)
    result_human = result * 100
    result_power = np.power(result, 2)

    return (
        mask,
        result_human,
        disequilibrium_mean_by_cluster(mask, result_power).sum() / matrix1.shape[0]
    )


def compute_count_matrix(y_truth, y_hat):
    '''
    Build the (cluster x class) count matrix from the two label vectors.
    Both vectors must have the same length (checked with an assertion).
    '''
    # Check the size of the lists
    assert len(y_hat) == len(y_truth), f"Label vectors should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}"

    # Build count matrix
    count_matrix = np.zeros((max(y_hat + 1), max(y_truth + 1)))
    for i in range(len(y_hat)):
        count_matrix[y_hat[i]][y_truth[i]] += 1
    return count_matrix


def entropy_score(y_truth, y_hat):
    '''
    y_hat and y_truth need to be label encoded before calling this function.
    Don't use one-hot labels.

    Return a tuple with:
    - result_matrix: the matrix with the log-multiplied probabilities (P(x) * log(P(x)))
    - result_vector: the vector with the entropy summed over each cluster. Each value corresponds to a cluster.
    - result: the final entropy measure of the clustering
    '''
    def divide_line(a, divider):
        '''
        Sub-function used for dividing a matrix by a vector, line by line.
        '''
        return np.divide(a, divider, out=np.zeros_like(a), where=divider != 0)

    # Build count matrix
    count_matrix = compute_count_matrix(y_truth, y_hat)

    # Build dividers vector
    dividers = count_matrix.sum(axis=1)

    matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=float), dividers)

    log_matrix = np.zeros(matrix_divided.shape)
    np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)
    result_matrix = -1 * np.multiply(matrix_divided, log_matrix)
    result_vector = result_matrix.sum(axis=1)

    if np.isnan(np.sum(result_vector)):
        print("COUNT MATRIX")
        print(count_matrix)
        print("MATRIX DIVIDED")
        print(matrix_divided)
        print("RESULT MATRIX")
        print(result_matrix)
        print("VECTOR MATRIX")
        print(result_vector)
        print("An error occurred due to a nan value, some values are printed above")
        exit(1)

    result = result_vector * dividers / dividers.sum()
    result = result.sum()
    return (result_matrix, result_vector, result)


def purity_score(y_truth, y_hat):
    '''
    Return three values in a dictionary:
    - purity_class_score: the purity score of the classes (asp)
    - purity_cluster_score: the purity score of the clusters (acp)
    - K: the overall evaluation criterion (sqrt(asp * acp))

    This function is based on the following article:
    Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan
    '''

    def divide_line(a, divider):
        '''
        Sub-function used for dividing a matrix by a vector, line by line.
        '''
        return np.divide(a, divider, out=np.zeros_like(a), where=divider != 0)

    def compute_purity_score(count_matrix, axis=0):
        if axis == 0:
            other_axis = 1
        else:
            other_axis = 0
        count_per_row = count_matrix.sum(axis=axis)
        dividers = np.square(count_per_row)

        count_matrix_squared = np.square(count_matrix)
        matrix_divided = np.apply_along_axis(divide_line, other_axis, np.asarray(count_matrix_squared, dtype=float), dividers)
        vector_purity = np.sum(matrix_divided, axis=axis)

        scalar_purity = np.average(vector_purity, weights=count_per_row)
        return (vector_purity, scalar_purity)

    count_matrix = compute_count_matrix(y_truth, y_hat)
    _, purity_cluster_score = compute_purity_score(count_matrix, 1)
    _, purity_class_score = compute_purity_score(count_matrix, 0)

    K = np.sqrt(purity_cluster_score * purity_class_score)

    return {
        "purity_class_score": purity_class_score,
        "purity_cluster_score": purity_cluster_score,
        "K": K
    }


if __name__ == "__main__":
    print("Purity test #1")
    # Hypothesis
    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
    # Truth
    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])

    (result_matrix, result_vector, result) = entropy_score(y, y_hat)
    print(purity_score(y, y_hat))

    print("Purity test #2")
    # Hypothesis
    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0, 4, 4, 4])
    # Truth
    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 0, 3, 3, 3])

    (result_matrix, result_vector, result) = entropy_score(y, y_hat)
    print("Result matrix: ")
    print(result_matrix)
    print("Result vector: ")
    print(result_vector)
    print("Result: ", result)
volia/plot-character.py (file was created)

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import argparse
from os.path import isfile
# the library now lives under volia/core
from volia.core.data_io import read_features, read_lst


if __name__ == "__main__":
    # Argparse
    parser = argparse.ArgumentParser(description="Plot points with color for each character")
    parser.add_argument("--features", type=str, help="features file path")
    parser.add_argument("--utt2char", type=str, help="utt2char file path")
    parser.add_argument("--sublist", type=str, default=None, help="white list of ids to take into account")
    parser.add_argument("--outfile", default="out.pdf", type=str, help="")
    parser.add_argument("--title", default="Example of plot", type=str, help="Specify the title")
    args = parser.parse_args()

    # List of assertions
    assert args.features, "Need to specify the features option"
    assert args.utt2char, "Need to specify the utt2char option"
    assert isfile(args.features), "Features path should point to a file"
    assert isfile(args.utt2char), "utt2char path should point to a file"
    if args.sublist is not None:
        assert isfile(args.sublist), "sublist path should point to a file"

    id_to_features = read_features(args.features)

    ids = []
    if args.sublist is not None:
        print("Using sublist")
        list_ids = read_lst(args.sublist)
        ids = [key for key in list_ids.keys()]
    else:
        ids = [key for key in id_to_features.keys()]

    utt2char = read_lst(args.utt2char)

    features = [id_to_features[id_] for id_ in ids]
    features = np.vstack(features)

    characters_list = [utt2char[id_][0] for id_ in ids]

    features_T = features.transpose()
    print("Number of characters: ", len(np.unique(characters_list)))
    df = pd.DataFrame(dict(
        x=features_T[0],
        y=features_T[1],
        character=characters_list))

    groups = df.groupby('character')

    # Plot
    fig, ax = plt.subplots()

    for character, group in groups:
        ax.plot(group.x, group.y, marker='o', linestyle='', ms=1, label=character)
    ax.legend()
    ax.set_title(args.title)
    plt.savefig(args.outfile)
    print("Plot saved to:", args.outfile)
File was created

if __name__ == "__main__":
    print("Volia is installed correctly.")
volia/tsne.py (file was created)

'''
The goal of this script is to calculate the t-SNE of p-vectors.
'''

import os
from os.path import isfile
import argparse
import numpy as np
from sklearn.manifold import TSNE

# the library now lives under volia/core
from volia.core.data_io import read_features

if __name__ == "__main__":
    # Defining argparse
    parser = argparse.ArgumentParser(prog='pvector tsne', description='Calculate the tsne representation of pvector in 3 or 2d')
    parser.add_argument('features', type=str,
                        help='the path of the file you want to calculate tsne')
    parser.add_argument('-o', '--outfile', type=str,
                        default='out.txt',
                        help='the path of the output file.')
    parser.add_argument('-n', '--n-comp', type=int, choices=[2, 3],
                        default=2,
                        help='number of components output of tsne')

    args = parser.parse_args()

    assert isfile(args.features)

    features_list = read_features(args.features)
    keys = list(features_list.keys())
    features = np.vstack([features_list[key] for key in keys])
    feat_tsne = TSNE(n_components=args.n_comp).fit_transform(features)

    with open(args.outfile, "w") as of:
        for i in range(len(keys)):
            of.write(keys[i] + " " + " ".join([str(feat) for feat in feat_tsne[i]]) + "\n")
    print("TSNE finished. Check if everything has been done well.")
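A sketch of the expected output for "-n 2", with hypothetical ids and values: one line per utterance, the id followed by its t-SNE coordinates, so the file can be read back with read_features or plotted with volia/plot-character.py.

# Hypothetical output file after:
#   python volia/tsne.py feats.txt -o tsne.txt -n 2
#
#   utt001 -3.412 8.051
#   utt002 12.870 -0.395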