Commit 29318d99165a89b6bece42ad02ed4f878753008a

Authored by Mathias
1 parent 4aa3a0ea73
Exists in master

Fix the axis along which the squared count matrix is divided when computing the purity scores

Showing 1 changed file with 15 additions and 3 deletions Inline Diff

1 ''' 1 '''
2 This module is a part of my library. 2 This module is a part of my library.
3 It aims to compute some measures for clustering. 3 It aims to compute some measures for clustering.
4 ''' 4 '''
5 5
6 import numpy as np 6 import numpy as np
7 7
def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
    '''
    Compute the element-wise disequilibrium between two count matrices.

    Both matrices are normalised, then the normalised matrix2 is subtracted
    from the normalised matrix1.

    Parameters:
    - matrix1, matrix2: 2-D count matrices (array-like) of the same shape.
      NOTE(review): rows are presumably clusters - confirm with the callers.
    - isGlobal: selects the denominator used for the normalisation:
        - True : every value is divided by the grand total of its matrix
        - False: every value is divided by the total of its own row
      A denominator equal to 0 yields 0 instead of a division error.
    - mod: optional whitespace-separated list of post-processing steps,
      applied in the order they are written:
        - "power": square the values
        - "human": multiply the values by 100
        - "abs"  : take the absolute value
      None or an empty string leaves the difference untouched.

    Returns a tuple (mask, result):
    - mask  : boolean matrix, False only where both inputs are 0
    - result: the (optionally post-processed) difference matrix

    Raises ValueError (a subclass of the Exception raised historically)
    when mod contains an unknown word.
    '''

    def normalise(values):
        '''
        Divide values by its grand total or by its row totals.
        '''
        if isGlobal:
            divider = values.sum()
        else:
            # keepdims gives a column vector that broadcasts over each row
            divider = values.sum(axis=1, keepdims=True)
        return np.divide(values, divider, out=np.zeros_like(values), where=divider != 0)

    # The builtin float replaces the np.float alias removed from NumPy
    values1 = np.asarray(matrix1, dtype=float)
    values2 = np.asarray(matrix2, dtype=float)

    result = normalise(values1) - normalise(values2)
    mask = np.logical_or(values1 != 0, values2 != 0)

    post_processing = {
        "power": lambda values: np.power(values, 2),
        "human": lambda values: values * 100,
        "abs": np.absolute,
    }

    # "mod or ''" makes both None and "" mean "no post-processing"
    for word in (mod or "").split():
        if word not in post_processing:
            raise ValueError("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\")")
        result = post_processing[word](result)
    return (mask, result)
59 59
60 60
61 61
def disequilibrium_mean_by_cluster(mask, matrix):
    '''
    Mean of the disequilibrium values of each row.

    Parameters:
    - mask  : boolean matrix with the same shape as matrix; the number of
      True cells of a row is the denominator of that row's mean.
    - matrix: disequilibrium values, one row per cluster.

    Returns a 1-D float array with one value per row: the sum of the row
    of matrix divided by the number of True cells in the same row of mask.
    A row whose mask is entirely False yields 0 rather than nan, in line
    with the zero-divider handling used by the other functions of this
    module.
    '''
    values = np.asarray(matrix, dtype=float)
    flags = np.asarray(mask)

    # Reduce every axis but the first one, so each row gives one number
    totals = values.sum(axis=tuple(range(1, values.ndim)))
    counts = flags.sum(axis=tuple(range(1, flags.ndim)))

    return np.divide(totals, counts, out=np.zeros_like(totals), where=counts != 0)
75 75
76 76
def disequilibrium(matrix1, matrix2, isGlobal=False):
    '''
    Disequilibrium matrix and overall disequilibrium value.

    Returns a tuple with:
    - the mask computed by disequilibrium_
    - the raw disequilibrium matrix multiplied by 100
    - the sum of the per-row means of the squared disequilibrium,
      divided by the number of rows of matrix1
    '''
    presence_mask, raw_diff = disequilibrium_(matrix1, matrix2, isGlobal)

    squared_diff = np.power(raw_diff, 2)
    mean_per_row = disequilibrium_mean_by_cluster(presence_mask, squared_diff)
    nb_rows = matrix1.shape[0]
    overall_value = mean_per_row.sum()/nb_rows

    return (presence_mask, raw_diff * 100, overall_value)
91 91
92 92
def compute_count_matrix(y_truth, y_hat):
    '''
    Build the count matrix of a clustering.

    Parameters:
    - y_truth: sequence of integer labels used as column indices.
    - y_hat  : sequence of integer labels used as row indices, with the
      same length as y_truth.

    Returns a float matrix of shape (max(y_hat) + 1, max(y_truth) + 1)
    whose cell [i][j] is the number of positions k where y_hat[k] == i and
    y_truth[k] == j. Two empty inputs give an empty (0, 0) matrix.

    Raises AssertionError when the two sequences have different lengths.
    '''
    # Accept plain lists as well as numpy arrays
    y_truth = np.asarray(y_truth)
    y_hat = np.asarray(y_hat)

    # Raised explicitly so the check survives "python -O", while keeping the
    # exception type existing callers may rely on
    if len(y_hat) != len(y_truth):
        raise AssertionError(f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}")

    if len(y_hat) == 0:
        return np.zeros((0, 0))

    count_matrix = np.zeros((y_hat.max() + 1, y_truth.max() + 1))
    # Unbuffered add: repeated (row, column) pairs are all counted
    np.add.at(count_matrix, (y_hat, y_truth), 1)
    return count_matrix
105 105
106 106
def entropy_score(y_truth, y_hat):
    '''
    Entropy of a clustering.

    Need to use a label encoder before giving y_hat and y_truth.
    Don't use one hot labels.

    Return a tuple with:
    - result_matrix : the matrix with the log multiplied probabilities (-P(x) * log2(P(x))),
      where P(x) is a count divided by the total of its row
    - result_vector : the vector obtained by summing result_matrix over each row.
      Each value corresponds to a row of the count matrix (a y_hat label).
    - result : the final entropy measure of the clustering, i.e. the mean
      of result_vector weighted by the row totals

    Raises ValueError when the computation produces a nan value.
    '''
    # Build count matrix (rows indexed by y_hat, columns by y_truth)
    count_matrix = np.asarray(compute_count_matrix(y_truth, y_hat), dtype=float)

    # Build dividers vector: one total per row
    dividers = count_matrix.sum(axis=1)
    row_totals = dividers[:, np.newaxis]

    # Row-wise probabilities; rows with a zero total stay at 0
    matrix_divided = np.divide(
        count_matrix,
        row_totals,
        out=np.zeros_like(count_matrix),
        where=row_totals != 0,
    )

    # log2 is only evaluated on non-empty cells, so log2(0) never happens
    log_matrix = np.zeros(matrix_divided.shape)
    np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)

    result_matrix = -1 * np.multiply(matrix_divided, log_matrix)
    result_vector = result_matrix.sum(axis=1)

    if np.isnan(np.sum(result_vector)):
        # Raise instead of printing and exiting: a library function must
        # not terminate the interpreter of its caller
        raise ValueError(
            "An error occured due to nan value\n"
            f"COUNT MATRIX\n{count_matrix}\n"
            f"MATRIX DIVIDED\n{matrix_divided}\n"
            f"RESULT MATRIX\n{result_matrix}\n"
            f"VECTOR MATRIX\n{result_vector}"
        )

    result = (result_vector * dividers / dividers.sum()).sum()
    return (result_matrix, result_vector, result)
152 152
153 153
def purity_score(y_truth, y_hat):
    '''
    Return three values in a dictionary:
    - purity_class_score: the purity score of the class (asp)
    - purity_cluster_score: the purity score of the cluster (acp)
    - K: the overall evaluation criterion (sqrt(asp * acp))

    Parameters:
    - y_truth: sequence of integer labels (columns of the count matrix)
    - y_hat  : sequence of integer labels (rows of the count matrix)

    This function is based on the following article:
    Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan
    '''

    def compute_purity_score(count_matrix, axis=0):
        '''
        Purity of every line of count_matrix obtained by summing over axis.

        Returns (vector_purity, scalar_purity): for each total taken along
        axis, the sum of the squared counts divided by the squared total,
        and the mean of these values weighted by the totals.
        '''
        counts = np.asarray(count_matrix, dtype=float)
        totals = counts.sum(axis=axis)

        # Re-insert the summed axis so each count is divided by the squared
        # total of the line it was summed into; zero totals give 0
        dividers = np.expand_dims(np.square(totals), axis)
        matrix_divided = np.divide(
            np.square(counts),
            dividers,
            out=np.zeros_like(counts),
            where=dividers != 0,
        )
        vector_purity = matrix_divided.sum(axis=axis)

        scalar_purity = np.average(vector_purity, weights=totals)
        return (vector_purity, scalar_purity)

    count_matrix = compute_count_matrix(y_truth, y_hat)
    # axis=1 sums over y_truth (one value per y_hat row),
    # axis=0 sums over y_hat (one value per y_truth column)
    _, purity_cluster_score = compute_purity_score(count_matrix, 1)
    _, purity_class_score = compute_purity_score(count_matrix, 0)

    K = np.sqrt(purity_cluster_score * purity_class_score)

    return {
        "purity_class_score": purity_class_score,
        "purity_cluster_score": purity_cluster_score,
        "K": K
    }
198 202
199 203
200 if __name__ == "__main__": 204 if __name__ == "__main__":
205 print("Purity test #1")
201 # Hypothesis 206 # Hypothesis
202 y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0]) 207 y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
203 # Truth 208 # Truth
204 y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]) 209 y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])
205 210
206 (result_matrix, result_vector, result) = entropy_score(y, y_hat) 211 (result_matrix, result_vector, result) = entropy_score(y, y_hat)
212 print(purity_score(y, y_hat))
207 213
214 exit(1)
215 print("Purity test #2")
216 # Hypothesis
217 y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0, 4, 4, 4])
218 # Truth
219 y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 0, 3, 3, 3])
208 220
209 print(purity_score(y, y_hat)) 221 (result_matrix, result_vector, result) = entropy_score(y, y_hat)
210 exit(1) 222 exit(1)
211 print("Result matrix: ") 223 print("Result matrix: ")
212 print(result_matrix) 224 print(result_matrix)
213 print("Result vector: ") 225 print("Result vector: ")
214 print(result_vector) 226 print(result_vector)