Commit 3c07f672ad6b92371b677895a2866763cfef1c37

Authored by quillotm
1 parent fea9649a74
Exists in master

Solve an error with the purity score.

Showing 1 changed file with 3 additions and 2 deletions Inline Diff

volia/core/measures.py
1 ''' 1 '''
2 This module is a part of my library. 2 This module is a part of my library.
3 It aims to compute some measures for clustering. 3 It aims to compute some measures for clustering.
4 ''' 4 '''
5 5
6 import numpy as np 6 import numpy as np
7 7
def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
    '''
    Compute the element-wise disequilibrium between two count matrices.

    Both matrices are normalised, then the normalised second matrix is
    subtracted from the normalised first one.

    Parameters:
        - matrix1, matrix2 : 2-D array-likes of counts with the same shape.
        - isGlobal : selects the denominator used for the normalisation:
            - True  : every value is divided by the sum of the whole matrix
            - False : every value is divided by the sum of its own row
          A denominator equal to 0 yields 0 instead of a division error.
        - mod : optional string of whitespace-separated transformations,
          applied in order to the difference:
            - "power" : square every value
            - "human" : multiply every value by 100
            - "abs"   : take the absolute value
          None or an empty string leaves the difference untouched.

    Return a tuple with:
        - mask : boolean matrix, False where both input cells are 0
        - result : the (optionally transformed) difference matrix

    Raise ValueError when mod contains an unknown transformation.
    '''

    def normalise(values):
        '''
        Divide the matrix by its total or by its row sums (see isGlobal).
        '''
        if isGlobal:
            dividers = values.sum()
        else:
            # keepdims makes the row sums broadcast against their own row.
            dividers = values.sum(axis=1, keepdims=True)
        return np.divide(values, dividers, out=np.zeros_like(values), where=dividers != 0)

    # The builtin float replaces the np.float alias removed from NumPy.
    first = np.asarray(matrix1, dtype=float)
    second = np.asarray(matrix2, dtype=float)

    result = normalise(first) - normalise(second)
    mask = np.logical_not(np.logical_and(second == 0, first == 0))

    transformations = {
        "power": lambda values: np.power(values, 2),
        "human": lambda values: values * 100,
        "abs": np.absolute,
    }

    # split() without argument ignores empty strings and repeated spaces,
    # so mod="" behaves like mod=None.
    for word in (mod or "").split():
        if word not in transformations:
            raise ValueError("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\")")
        result = transformations[word](result)
    return (mask, result)
59 59
60 60
61 61
def disequilibrium_mean_by_cluster(mask, matrix):
    '''
    Mean of the disequilibrium for each cluster.

    Parameters:
        - mask : boolean matrix; the number of True values of a row is the
          denominator of the mean of that row.
        - matrix : disequilibrium values, one row per cluster.

    Return a vector with one mean per row of matrix. A row whose mask has
    no True value gets 0 instead of a NaN coming from a 0/0 division.
    '''
    nb_k = len(matrix)

    totals = np.fromiter((np.sum(matrix[i]) for i in range(nb_k)), dtype=float, count=nb_k)
    counts = np.fromiter((np.sum(mask[i]) for i in range(nb_k)), dtype=float, count=nb_k)

    # Same convention as the rest of the module: dividing by 0 gives 0.
    return np.divide(totals, counts, out=np.zeros_like(totals), where=counts != 0)
75 75
76 76
def disequilibrium(matrix1, matrix2, isGlobal=False):
    '''
    Disequilibrium matrix and disequilibrium value.

    Return a tuple with:
        - the mask returned by disequilibrium_
        - the raw disequilibrium matrix multiplied by 100
        - a scalar: the per-cluster means of the squared disequilibrium,
          summed and divided by the number of rows of matrix1
    '''
    mask, raw = disequilibrium_(matrix1, matrix2, isGlobal)

    squared = np.power(raw, 2)
    cluster_means = disequilibrium_mean_by_cluster(mask, squared)
    nb_rows = matrix1.shape[0]

    return (mask, raw * 100, cluster_means.sum() / nb_rows)
91 91
92 92
def compute_count_matrix(y_truth, y_hat):
    '''
    Build the count matrix between predicted and true labels.

    Parameters:
        - y_truth : sequence of integer true labels
        - y_hat : sequence of integer predicted labels, same length

    Return a float matrix where cell [i][j] is the number of elements
    with predicted label i and true label j. Its shape is
    (max(y_hat) + 1, max(y_truth) + 1), or (0, 0) for empty inputs.

    The size of the lists is checked with an assertion.
    '''
    # Check size of the lists
    assert len(y_hat) == len(y_truth), f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}"

    # Converting first lets plain lists work as well as numpy arrays.
    predicted = np.asarray(y_hat)
    truth = np.asarray(y_truth)

    if predicted.size == 0:
        return np.zeros((0, 0))

    # Build count matrix
    count_matrix = np.zeros((int(predicted.max()) + 1, int(truth.max()) + 1))
    # add.at accumulates repeated (row, column) pairs, unlike "+=" with
    # fancy indexing which would count each pair only once.
    np.add.at(count_matrix, (predicted, truth), 1)
    return count_matrix
105 105
106 106
def entropy_score(y_truth, y_hat):
    '''
    Entropy of a clustering.

    Need to use label encoder before giving y_hat and y_truth.
    Don't use one hot labels.

    For each cluster (row of the count matrix) the entropy
    -sum(P(x) * log2(P(x))) is computed over the true labels, then the
    cluster entropies are averaged with the cluster sizes as weights.

    Return the final entropy measure of the clustering (a single value).

    Raise ValueError when a NaN shows up in the cluster entropies.
    '''
    # Build count matrix; the builtin float replaces the np.float alias
    # removed from NumPy.
    count_matrix = np.asarray(compute_count_matrix(y_truth, y_hat), dtype=float)

    # Number of elements of each cluster, kept as a column to broadcast
    # against the rows of the count matrix.
    cluster_sizes = count_matrix.sum(axis=1, keepdims=True)

    probabilities = np.divide(
        count_matrix,
        cluster_sizes,
        out=np.zeros_like(count_matrix),
        where=cluster_sizes != 0
    )

    # log2 is only evaluated on non-empty cells, the others stay at 0.
    log_matrix = np.zeros(probabilities.shape)
    np.log2(probabilities, out=log_matrix, where=count_matrix != 0)

    result_matrix = -1 * np.multiply(probabilities, log_matrix)
    result_vector = result_matrix.sum(axis=1)

    if np.isnan(np.sum(result_vector)):
        # Raise instead of printing and exiting the interpreter, the
        # caller decides what to do with the failure.
        raise ValueError(
            "An error occured due to nan value.\n"
            f"COUNT MATRIX\n{count_matrix}\n"
            f"MATRIX DIVIDED\n{probabilities}\n"
            f"RESULT MATRIX\n{result_matrix}\n"
            f"VECTOR MATRIX\n{result_vector}"
        )

    dividers = cluster_sizes.ravel()
    result = result_vector * dividers / dividers.sum()
    return result.sum()
152 152
153 153
def purity_score(y_truth, y_hat):
    '''
    Return three values in a dictionary:
        - purity_class_score: the purity score of the class (asp)
        - purity_cluster_score: the purity score of the cluster (acp)
        - K: the overall evaluation criterion (sqrt(asp * acp))

    This function is based on the following article:
    Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan
    '''

    def compute_purity_score(count_matrix, axis=0):
        '''
        Weighted average purity of the groups obtained by summing the
        count matrix along axis (0: one group per column, 1: one group
        per row).
        '''
        # keepdims lets the totals broadcast against the count matrix.
        totals = count_matrix.sum(axis=axis, keepdims=True)
        dividers = np.square(totals)

        # The builtin float replaces the np.float alias removed from NumPy.
        count_matrix_squared = np.square(np.asarray(count_matrix, dtype=float))
        matrix_divided = np.divide(
            count_matrix_squared,
            dividers,
            out=np.zeros_like(count_matrix_squared),
            where=dividers != 0
        )
        vector_purity = np.sum(matrix_divided, axis=axis)

        # Each group weighs as much as the number of elements it holds.
        return np.average(vector_purity, weights=totals.ravel())

    count_matrix = compute_count_matrix(y_truth, y_hat)

    purity_cluster_score = compute_purity_score(count_matrix, 1)
    purity_class_score = compute_purity_score(count_matrix, 0)

    K = np.sqrt(purity_cluster_score * purity_class_score)

    return {
        "purity_class_score": purity_class_score,
        "purity_cluster_score": purity_cluster_score,
        "K": K
    }
202 203
203 204
if __name__ == "__main__":
    # Small manual checks of the scores. entropy_score returns a single
    # value, so it is stored in one name instead of being unpacked.
    print("Purity test #1")
    # Hypothesis
    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
    # Truth
    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])

    result = entropy_score(y, y_hat)
    print(purity_score(y, y_hat))
    print("Result: ", result)

    print("Purity test #2")
    # Hypothesis
    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0, 4, 4, 4])
    # Truth
    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 0, 3, 3, 3])

    result = entropy_score(y, y_hat)
    print(purity_score(y, y_hat))
    print("Result: ", result)