Quillot Mathias / volia

Browse Code »

Commit 9191399c3b15f017c4a84edeacdb799b490c07e4

Authored by quillotm 2021-08-11 07:34:44 +0200

1 parent 40650f20d7

Exists in master

Clustering and evaluation are now available, and both can be configured through global variables.

Showing 3 changed files with 95 additions and 7 deletions Inline Diff

volia/clustering.py
volia/clustering_modules/kmeans.py
volia/core/measures.py

volia/clustering.py

Diff comments View file @ 9191399

 import argparse
 from os import path, mkdir
 from utils import SubCommandRunner
-from core.data import read_features, read_lst
+from core.data import read_features, read_lst, read_labels
 import numpy as np
 from sklearn.cluster import KMeans
 import pickle
+from clustering_modules.kmeans import kmeans
+from sklearn.preprocessing import LabelEncoder
+from sklearn.metrics import v_measure_score
+import core.measures
+CLUSTERING_METHODS = {
+    "k-means": kmeans()
+}
+EVALUATION_METHODS = {
+    "entropy": core.measures.entropy_score,
+    "v-measure": v_measure_score
+}
+def disequilibrium_run():
+    pass
+def measure_run(measure: str, features: str, lst: str, truelabels: str, model: str, modeltype: str):
+    module = CLUSTERING_METHODS[modeltype]
+    module.load(model)
+    evaluation = EVALUATION_METHODS[measure]
+    feats_dict = read_features(features)
+    labels_dict = read_labels(truelabels)
+    lst_dict = read_lst(lst)
+    lst_keys = [key for key in lst_dict]
+    feats = np.asarray([feats_dict[key] for key in lst_keys])
+    Y_pred = module.predict(feats)
+    Y_truth = [labels_dict[key][0] for key in lst_keys]
+    le = LabelEncoder()
+    le.fit(Y_truth)
+    Y_truth = le.transform(Y_truth)
+    eval = evaluation(Y_truth, Y_pred)
+    print(eval)
 def kmeans_run(features: str, lst: str, k:int, kmax: int, klist, output: str):
     """
     @param features: output features
     @param lst: list file
     @param k: k (kmin if kmax specified)
     @param kmax: maximum k to compute
     @param klist: list of k values to compute, ignore k value
     @param output: output file if kmax not specified, else, output directory
     """
-    # -- READE FILES --
+    # -- READ FILES --
     features_dict = read_features(features)
     lst_dict = read_lst(lst)
     X = np.asarray([features_dict[x] for x in lst_dict])
     # Exception cases
     if kmax is None and klist is None and path.isdir(output):
         raise Exception("The \"output\" is an existing directory while the system is waiting the path of a file.")
     if (kmax is not None or klist is not None) and path.isfile(output):
         raise Exception("The \"output\" is an existing file while the system is waiting the path of a directory.")
     # Mono value case
     if kmax is None and klist is None:
         print(f"Computing clustering with k={k}")
         kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
         preds = kmeans.predict(X)
         pickle.dump(kmeans, open(output, "wb"))
     # Multi values case with kmax
     if kmax is not None:
         if not path.isdir(output):
             mkdir(output)
         Ks = range(k, kmax + 1)
         for i in Ks:
             print(f"Computing clustering with k={i}")
             kmeans = KMeans(n_clusters=i, n_init=10, random_state=0).fit(X)
             preds = kmeans.predict(X)
             pickle.dump(kmeans, open(path.join(output, "clustering_" + str(i) + ".pkl"), "wb"))
     # Second multi values case with klist
     if klist is not None:
         if not path.isdir(output):
             mkdir(output)
         for k in klist:
             k = int(k)
             print(f"Computing clustering with k={k}")
             kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
             preds = kmeans.predict(X)
             pickle.dump(kmeans, open(path.join(output, "clustering_" + str(k) + ".pkl"), "wb"))
 if __name__ == "__main__":
     # Main parser
     parser = argparse.ArgumentParser(description="Clustering methods to apply")
     subparsers = parser.add_subparsers(title="action")
     # kmeans
     parser_kmeans = subparsers.add_parser(
         "kmeans", help="Compute clustering using k-means algorithm")
     parser_kmeans.add_argument("--features", required=True, type=str, help="Features file (works with list)")
     parser_kmeans.add_argument("--lst", required=True, type=str, help="List file (.lst)")
     parser_kmeans.add_argument("-k", default=2, type=int,
                                help="number of clusters to compute. It is kmin if kmax is specified.")
     parser_kmeans.add_argument("--kmax", default=None, type=int, help="if specified, k is kmin.")
     parser_kmeans.add_argument("--klist", nargs="+",
                                help="List of k values to test. As kmax, activate the multi values mod.")
     parser_kmeans.add_argument("--output", default=".kmeans", help="output file if only k. Output directory if multiple kmax specified.")
     parser_kmeans.set_defaults(which="kmeans")
+    # measure
+    parser_measure = subparsers.add_parser(
+        "measure", help="compute the entropy")
+    parser_measure.add_argument("--measure",
+                                required=True,
+                                type=str,
+                                choices=[key for key in EVALUATION_METHODS],
+                                help="...")
+    parser_measure.add_argument("--features", required=True, type=str, help="...")
+    parser_measure.add_argument("--lst", required=True, type=str, help="...")
+    parser_measure.add_argument("--truelabels", required=True, type=str, help="...")
+    parser_measure.add_argument("--model", required=True, type=str, help="...")
+    parser_measure.add_argument("--modeltype",
+                                required=True,
+                                choices=[key for key in CLUSTERING_METHODS],
+                                help="type of model for learning")
+    parser_measure.set_defaults(which="measure")
+    # disequilibrium
+    parser_disequilibrium = subparsers.add_parser(
+        "disequilibrium", help="...")
+    parser_disequilibrium.add_argument("--features", required=True, type=str, help="...")
+    parser_disequilibrium.add_argument("--lstrain", required=True, type=str, help="...")
+    parser_disequilibrium.add_argument("--lstest", required=True, type=str, help="...")
+    parser_disequilibrium.add_argument("--model", required=True, type=str, help="...")
+    parser_disequilibrium.add_argument("--model-type",
+                                required=True,
+                                choices=["kmeans", "2", "3"],
+                                help="...")
     # Parse
     args = parser.parse_args()
     # Run commands
     runner = SubCommandRunner({
-        "kmeans": kmeans_run
+        "kmeans": kmeans_run,
+        "measure": measure_run,
+        "disequilibrium": disequilibrium_run
     })
     runner.run(args.which, args.__dict__, remove="which")

volia/clustering_modules/kmeans.py

Diff comments View file @ 9191399

File was created	1
	2	from sklearn.cluster import KMeans
	3	import pickle
	4	from abstract_clustering import AbstractClustering
	5
	6	class kmeans():
	7	def __init__(self):
	8	self.kmeans_model = None
	9
	10	def predict(self, features):
	11	return self.kmeans_model.predict(features)
	12
	13	def load(self, model_path):
	14	self.kmeans_model = pickle.load(open(model_path, "rb"))
	15

volia/core/measures.py

Diff comments View file @ 9191399

1	'''	1	'''
2	This module is a part of my library.	2	This module is a part of my library.
3	It aims to compute some measures for clustering.	3	It aims to compute some measures for clustering.
4	'''	4	'''
5		5
6	import numpy as np	6	import numpy as np
7		7
8	def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):	8	def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
9	'''	9	'''
10	Compute disequilibrium for all the clusters.	10	Compute disequilibrium for all the clusters.
11	The disequilibrium is computed from the difference	11	The disequilibrium is computed from the difference
12	between two clustering sets.	12	between two clustering sets.
13	isGlobal lets the user choose the denominator of	13	isGlobal lets the user choose the denominator of
14	the function:	14	the function:
15	- True: divides the values by the total number of elements	15	- True: divides the values by the total number of elements
16	- False: divides the values by the number of elements in each row (cluster)	16	- False: divides the values by the number of elements in each row (cluster)
17		17
18	mod lets the user apply a square ("power"), a percentage scaling ("human")	18	mod lets the user apply a square ("power"), a percentage scaling ("human")
19	or an absolute value ("abs"); several can be combined, separated by spaces.	19	or an absolute value ("abs"); several can be combined, separated by spaces.
20	'''	20	'''
21		21
22	def divide_line(a, divider):	22	def divide_line(a, divider):
23	'''	23	'''
24	Sub function used for dividing matrix by a vector line by line.	24	Sub function used for dividing matrix by a vector line by line.
25	'''	25	'''
26	return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)	26	return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
27		27
28	dividers1 = 0	28	dividers1 = 0
29	dividers2 = 0	29	dividers2 = 0
30		30
31	if isGlobal:	31	if isGlobal:
32	dividers1 = matrix1.sum()	32	dividers1 = matrix1.sum()
33	dividers2 = matrix2.sum()	33	dividers2 = matrix2.sum()
34	else:	34	else:
35	dividers1 = matrix1.sum(axis=1)	35	dividers1 = matrix1.sum(axis=1)
36	dividers2 = matrix2.sum(axis=1)	36	dividers2 = matrix2.sum(axis=1)
37		37
38	matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=np.float), dividers1)	38	matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=np.float), dividers1)
39		39
40	matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=np.float), dividers2)	40	matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=np.float), dividers2)
41		41
42	diff = matrix1_divided - matrix2_divided	42	diff = matrix1_divided - matrix2_divided
43		43
44	mask = np.logical_not(np.logical_and(matrix2==0, matrix1==0))	44	mask = np.logical_not(np.logical_and(matrix2==0, matrix1==0))
45		45
46	result = diff	46	result = diff
47		47
48	if mod != None or mod == "":	48	if mod != None or mod == "":
49	for word in mod.split(" "):	49	for word in mod.split(" "):
50	if word == "power":	50	if word == "power":
51	result = np.power(result,2)	51	result = np.power(result,2)
52	elif word == "human":	52	elif word == "human":
53	result = result * 100	53	result = result * 100
54	elif word == "abs":	54	elif word == "abs":
55	result = np.absolute(result)	55	result = np.absolute(result)
56	else:	56	else:
57	raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\"")	57	raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\"")
58	return (mask, result)	58	return (mask, result)
59		59
60		60
61		61
62	def disequilibrium_mean_by_cluster(mask, matrix):	62	def disequilibrium_mean_by_cluster(mask, matrix):
63	'''	63	'''
64	Mean of disequilibrium	64	Mean of disequilibrium
65	matrix is the disequilibrium calculated	65	matrix is the disequilibrium calculated
66	from number of occurrences belonging to a class,	66	from number of occurrences belonging to a class,
67	for each cluster.	67	for each cluster.
68	'''	68	'''
69	nb_k = len(matrix)	69	nb_k = len(matrix)
70	results = np.zeros((nb_k))	70	results = np.zeros((nb_k))
71		71
72	for i in range(nb_k):	72	for i in range(nb_k):
73	results[i] = matrix[i].sum() / mask[i].sum()	73	results[i] = matrix[i].sum() / mask[i].sum()
74	return results	74	return results
75		75
76		76
77	def disequilibrium(matrix1, matrix2, isGlobal=False):	77	def disequilibrium(matrix1, matrix2, isGlobal=False):
78	'''	78	'''
79	Disequilibrium matrix	79	Disequilibrium matrix
80	And Disequilibrium value	80	And Disequilibrium value
81	'''	81	'''
82	mask, result = disequilibrium_(matrix1, matrix2, isGlobal)	82	mask, result = disequilibrium_(matrix1, matrix2, isGlobal)
83	result_human = result * 100	83	result_human = result * 100
84	result_power = np.power(result, 2)	84	result_power = np.power(result, 2)
85		85
86	return (	86	return (
87	mask,	87	mask,
88	result_human,	88	result_human,
89	disequilibrium_mean_by_cluster(mask, result_power).sum()/matrix1.shape[0]	89	disequilibrium_mean_by_cluster(mask, result_power).sum()/matrix1.shape[0]
90	)	90	)
91		91
92		92
93	def compute_count_matrix(y_truth, y_hat):	93	def compute_count_matrix(y_truth, y_hat):
94	'''	94	'''
95	Check the size of the lists with assertion	95	Check the size of the lists with assertion
96	'''	96	'''
97	# Check size of the lists	97	# Check size of the lists
98	assert len(y_hat) == len(y_truth), f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}"	98	assert len(y_hat) == len(y_truth), f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}"
99		99
100	# Build count matrix	100	# Build count matrix
101	count_matrix = np.zeros((max(y_hat+1), max(y_truth+1)))	101	count_matrix = np.zeros((max(y_hat+1), max(y_truth+1)))
102	for i in range(len(y_hat)):	102	for i in range(len(y_hat)):
103	count_matrix[y_hat[i]][y_truth[i]] += 1	103	count_matrix[y_hat[i]][y_truth[i]] += 1
104	return count_matrix	104	return count_matrix
105		105
106		106
107	def entropy_score(y_truth, y_hat):	107	def entropy_score(y_truth, y_hat):
108	'''	108	'''
109	Need to use label encoder before giving y_hat and y_truth	109	Need to use label encoder before giving y_hat and y_truth
110	Don't use one hot labels	110	Don't use one hot labels
111		111
112	Return a tuple with:	112	Return a tuple with:
113	- result_matrix : the matrix with the log multiplied probabilities (P(x) * log(P(x)))	113	- result_matrix : the matrix with the log multiplied probabilities (P(x) * log(P(x)))
114	- result_vector : the vector obtained by summing the entropy over the classes. Each value corresponds to a cluster.	114	- result_vector : the vector obtained by summing the entropy over the classes. Each value corresponds to a cluster.
115	- result : the final entropy measure of the clustering	115	- result : the final entropy measure of the clustering
116	'''	116	'''
117	def divide_line(a, divider):	117	def divide_line(a, divider):
118	'''	118	'''
119	Sub function used for dividing matrix by a vector line by line.	119	Sub function used for dividing matrix by a vector line by line.
120	'''	120	'''
121	return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)	121	return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
122		122
123	# Build count matrix	123	# Build count matrix
124	count_matrix = compute_count_matrix(y_truth, y_hat)	124	count_matrix = compute_count_matrix(y_truth, y_hat)
125		125
126	# Build dividers vector	126	# Build dividers vector
127	dividers = count_matrix.sum(axis=1)	127	dividers = count_matrix.sum(axis=1)
128		128
129	matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=np.float), dividers)	129	matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=np.float), dividers)
130		130
131	log_matrix = np.zeros(matrix_divided.shape)	131	log_matrix = np.zeros(matrix_divided.shape)
132	np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)	132	np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)
133	result_matrix = -1 * np.multiply(matrix_divided, log_matrix)	133	result_matrix = -1 * np.multiply(matrix_divided, log_matrix)
134	result_vector = result_matrix.sum(axis=1)	134	result_vector = result_matrix.sum(axis=1)
135	result_vector.sum()	135	result_vector.sum()
136		136
137	if np.isnan(np.sum(result_vector)):	137	if np.isnan(np.sum(result_vector)):
138	print("COUNT MATRIX")	138	print("COUNT MATRIX")
139	print(count_matrix)	139	print(count_matrix)
140	print("MATRIX DIVIDED")	140	print("MATRIX DIVIDED")
141	print(matrix_divided)	141	print(matrix_divided)
142	print("RESULT MATRIX")	142	print("RESULT MATRIX")
143	print(result_matrix)	143	print(result_matrix)
144	print("VECTOR MATRIX")	144	print("VECTOR MATRIX")
145	print(result_vector)	145	print(result_vector)
146	print("An error occured due to nan value, some values are printed before")	146	print("An error occured due to nan value, some values are printed before")
147	exit(1)	147	exit(1)
148		148
149	result = result_vector * dividers / dividers.sum()	149	result = result_vector * dividers / dividers.sum()
150	result = result.sum()	150	result = result.sum()
151	return (result_matrix, result_vector, result)	151	return result
152		152
153		153
154	def purity_score(y_truth, y_hat):	154	def purity_score(y_truth, y_hat):
155	'''	155	'''
156	Return three values in a dictionary:	156	Return three values in a dictionary:
157	- purity_class_score: the purity score of the class (asp)	157	- purity_class_score: the purity score of the class (asp)
158	- purity_cluster_score: the purity score of the cluster (acp)	158	- purity_cluster_score: the purity score of the cluster (acp)
159	- K: the overall evaluation criterion (sqrt(asp * acp))	159	- K: the overall evaluation criterion (sqrt(asp * acp))
160		160
161	This function is based on the following article:	161	This function is based on the following article:
162	Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan	162	Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan
163	'''	163	'''
164		164
165	def divide_line(a, divider):	165	def divide_line(a, divider):
166	'''	166	'''
167	Sub function used for dividing matrix by a vector line by line.	167	Sub function used for dividing matrix by a vector line by line.
168	'''	168	'''
169	return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)	169	return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
170		170
171	def compute_purity_score(count_matrix, axis=0):	171	def compute_purity_score(count_matrix, axis=0):
172	if axis==0:	172	if axis==0:
173	other_axis = 1	173	other_axis = 1
174	else:	174	else:
175	other_axis = 0	175	other_axis = 0
176	count_per_row = count_matrix.sum(axis=axis)	176	count_per_row = count_matrix.sum(axis=axis)
177	dividers = np.square(count_per_row)	177	dividers = np.square(count_per_row)
178		178
179	count_matrix_squared = np.square(count_matrix)	179	count_matrix_squared = np.square(count_matrix)
180	matrix_divided = np.apply_along_axis(divide_line, other_axis, np.asarray(count_matrix_squared, dtype=np.float), dividers)	180	matrix_divided = np.apply_along_axis(divide_line, other_axis, np.asarray(count_matrix_squared, dtype=np.float), dividers)
181	vector_purity = np.sum(matrix_divided, axis=axis)	181	vector_purity = np.sum(matrix_divided, axis=axis)
182		182
183	scalar_purity = np.average(vector_purity, weights=count_per_row)	183	scalar_purity = np.average(vector_purity, weights=count_per_row)
184	return (vector_purity, scalar_purity)	184	return scalar_purity
185		185
186		186
187	count_matrix = compute_count_matrix(y_truth, y_hat)	187	count_matrix = compute_count_matrix(y_truth, y_hat)
188	_, purity_cluster_score = compute_purity_score(count_matrix, 1)	188	_, purity_cluster_score = compute_purity_score(count_matrix, 1)
189	_, purity_class_score = cluster_purity = compute_purity_score(count_matrix, 0)	189	_, purity_class_score = cluster_purity = compute_purity_score(count_matrix, 0)
190		190
191	K = np.sqrt(purity_cluster_score * purity_class_score)	191	K = np.sqrt(purity_cluster_score * purity_class_score)
192		192
193	for i in range(count_matrix.shape[0]):	193	for i in range(count_matrix.shape[0]):
194	for j in range(count_matrix.shape[1]):	194	for j in range(count_matrix.shape[1]):
195	count_matrix[i][j]	195	count_matrix[i][j]
196	count_matrix[i]	196	count_matrix[i]
197	return {	197	return {
198	"purity_class_score": purity_class_score,	198	"purity_class_score": purity_class_score,
199	"purity_cluster_score": purity_cluster_score,	199	"purity_cluster_score": purity_cluster_score,
200	"K": K	200	"K": K
201	}	201	}
202		202
203		203
204	if __name__ == "__main__":	204	if __name__ == "__main__":
205	print("Purity test #1")	205	print("Purity test #1")
206	# Hypothesis	206	# Hypothesis
207	y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])	207	y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
208	# Truth	208	# Truth
209	y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])	209	y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])
210		210
211	(result_matrix, result_vector, result) = entropy_score(y, y_hat)	211	(result_matrix, result_vector, result) = entropy_score(y, y_hat)
212	print(purity_score(y, y_hat))	212	print(purity_score(y, y_hat))
213		213
214	exit(1)	214	exit(1)
215	print("Purity test #2")	215	print("Purity test #2")
216	# Hypothesis	216	# Hypothesis
217	y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0, 4, 4, 4])	217	y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0, 4, 4, 4])
218	# Truth	218	# Truth
219	y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 0, 3, 3, 3])	219	y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 0, 3, 3, 3])
220		220
221	(result_matrix, result_vector, result) = entropy_score(y, y_hat)	221	(result_matrix, result_vector, result) = entropy_score(y, y_hat)
222	exit(1)	222	exit(1)
223	print("Result matrix: ")	223	print("Result matrix: ")
224	print(result_matrix)	224	print(result_matrix)
225	print("Result vector: ")	225	print("Result vector: ")
226	print(result_vector)	226	print(result_vector)
227	print("Result: ", result)	227	print("Result: ", result)