Commit 503bfd9274290724fe9f4f0668a8fea7134a071b

Authored by Mathias
1 parent aeff19f951
Exists in master

Add comments to the purity_score function

Showing 1 changed file with 9 additions and 0 deletions Inline Diff

1 ''' 1 '''
2 This module is a part of my library. 2 This module is a part of my library.
3 It aims to compute some measures for clustering. 3 It aims to compute some measures for clustering.
4 ''' 4 '''
5 5
6 import numpy as np 6 import numpy as np
7 7
def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
    '''
    Compute the disequilibrium between two clustering count matrices.

    Each matrix is normalised, the two normalised matrices are subtracted
    element-wise (matrix1 - matrix2) and the difference is optionally
    post-processed according to `mod`.

    Parameters:
    - matrix1, matrix2: 2-D count matrices of identical shape
      (one row per cluster -- presumably; confirm against the callers).
    - isGlobal: selects the denominator of the normalisation:
        - True  : each value is divided by the total of its whole matrix
        - False : each value is divided by the total of its own row
      A zero denominator yields 0 instead of a division error.
    - mod: optional string of space separated transformations, applied in
      the given order:
        - "power" : square the values
        - "human" : multiply the values by 100
        - "abs"   : take the absolute value
      None or an empty string leaves the difference untouched.

    Return a tuple with:
    - mask   : boolean matrix, False where both input matrices are 0
    - result : the (possibly transformed) difference matrix

    Raise ValueError when `mod` contains an unknown word.
    '''

    def normalise(matrix):
        '''
        Sub function dividing a matrix by its total or by its row totals.
        '''
        # The builtin float replaces np.float, which was removed from NumPy.
        values = np.asarray(matrix, dtype=float)
        if isGlobal:
            dividers = values.sum()
        else:
            # keepdims gives a column vector so each row is divided by
            # its own total through broadcasting.
            dividers = values.sum(axis=1, keepdims=True)
        return np.divide(values, dividers,
                         out=np.zeros_like(values), where=dividers != 0)

    result = normalise(matrix1) - normalise(matrix2)

    # A cell is relevant as soon as one of the two matrices is non zero.
    mask = np.logical_or(np.asarray(matrix1) != 0, np.asarray(matrix2) != 0)

    transformations = {
        "power": np.square,
        "human": lambda values: values * 100,
        "abs": np.absolute,
    }

    # None and "" both mean "no transformation requested".
    if mod:
        for word in mod.split():
            if word not in transformations:
                raise ValueError("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\")")
            result = transformations[word](result)
    return (mask, result)
59 59
60 60
61 61
def disequilibrium_mean_by_cluster(mask, matrix):
    '''
    Mean of the disequilibrium of each cluster.

    Parameters:
    - mask   : boolean matrix marking the cells that count in the mean
      (same number of rows as `matrix`).
    - matrix : the disequilibrium values, one row per cluster.

    Return a vector with one value per row: the sum of the row of
    `matrix` divided by the number of True cells in the matching row of
    `mask`. A row whose mask is entirely False gives 0 instead of NaN,
    so it cannot poison a later aggregation.
    '''
    nb_k = len(matrix)
    totals = np.array([np.sum(matrix[i]) for i in range(nb_k)], dtype=float)
    counts = np.array([np.sum(mask[i]) for i in range(nb_k)], dtype=float)

    # Guarded division: rows without any relevant cell stay at 0.
    return np.divide(totals, counts, out=np.zeros_like(totals), where=counts != 0)
75 75
76 76
def disequilibrium(matrix1, matrix2, isGlobal=False):
    '''
    Disequilibrium matrix and disequilibrium value of two count matrices.

    Return a tuple with:
    - the mask returned by disequilibrium_
    - the raw disequilibrium matrix multiplied by 100
    - the sum of the per-cluster means of the squared disequilibrium,
      divided by the number of rows of matrix1
    '''
    mask, raw = disequilibrium_(matrix1, matrix2, isGlobal)

    squared = np.power(raw, 2)
    percentages = raw * 100

    per_cluster = disequilibrium_mean_by_cluster(mask, squared)
    value = per_cluster.sum() / matrix1.shape[0]

    return (mask, percentages, value)
91 91
92 92
def compute_count_matrix(y_truth, y_hat):
    '''
    Build the count matrix of a clustering.

    Parameters:
    - y_truth : sequence of integer class labels (the truth)
    - y_hat   : sequence of integer cluster labels (the hypothesis)
    Both must have the same length and hold non negative integers
    (use a label encoder first).

    Return a float matrix of shape (max(y_hat) + 1, max(y_truth) + 1)
    where cell [k][c] is the number of elements put in cluster k whose
    true class is c. Empty inputs give an empty (0, 0) matrix.

    Raise AssertionError when the two sequences differ in length.
    '''
    # Accept plain lists as well as arrays.
    y_truth = np.asarray(y_truth)
    y_hat = np.asarray(y_hat)

    # Check size of the lists. Raised explicitly so the check survives
    # `python -O` while keeping the exception type callers already see.
    if len(y_hat) != len(y_truth):
        raise AssertionError(f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}")

    if y_hat.size == 0:
        return np.zeros((0, 0))

    # Build count matrix; np.add.at accumulates repeated (cluster, class)
    # pairs, which a plain fancy-indexed += would not.
    count_matrix = np.zeros((y_hat.max() + 1, y_truth.max() + 1))
    np.add.at(count_matrix, (y_hat, y_truth), 1)
    return count_matrix
105 105
106 106
def entropy_score(y_truth, y_hat):
    '''
    Entropy of a clustering with respect to the true classes.

    Need to use label encoder before giving y_hat and y_truth.
    Don't use one hot labels.

    Return a tuple with:
    - result_matrix : the matrix with the log multiplied probabilities (-P(x) * log2(P(x)))
    - result_vector : the vector summing the entropy of each class. Each value corresponds to a cluster.
    - result : the final entropy measure of the clustering (mean of
      result_vector weighted by the size of each cluster)

    Raise ValueError if a NaN appears during the computation.
    '''
    # Build count matrix
    count_matrix = compute_count_matrix(y_truth, y_hat)

    # Build dividers vector (number of elements of each cluster)
    dividers = count_matrix.sum(axis=1)

    # Divide each row by its total; empty clusters stay at 0.
    # The builtin float replaces np.float, which was removed from NumPy.
    counts = np.asarray(count_matrix, dtype=float)
    row_totals = dividers.reshape(-1, 1)
    matrix_divided = np.divide(counts, row_totals,
                               out=np.zeros_like(counts), where=row_totals != 0)

    # log2 is only evaluated on non empty cells, the others keep 0.
    log_matrix = np.zeros(matrix_divided.shape)
    np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)
    result_matrix = -1 * np.multiply(matrix_divided, log_matrix)
    result_vector = result_matrix.sum(axis=1)

    if np.isnan(np.sum(result_vector)):
        # Report the failure to the caller instead of printing and
        # terminating the whole process from inside a library function.
        raise ValueError(
            "An error occured due to nan value\n"
            f"COUNT MATRIX\n{count_matrix}\n"
            f"MATRIX DIVIDED\n{matrix_divided}\n"
            f"RESULT MATRIX\n{result_matrix}\n"
            f"VECTOR MATRIX\n{result_vector}"
        )

    result = (result_vector * dividers / dividers.sum()).sum()
    return (result_matrix, result_vector, result)
152 152
153 153
def purity_score(y_truth, y_hat):
    '''
    Return three values in a dictionary:
    - purity_class_score: the purity score of the class (asp)
    - purity_cluster_score: the purity score of the cluster (acp)
    - K: the overall evaluation criterion (sqrt(asp * acp))

    y_truth and y_hat are label-encoded integer sequences of the same
    length (see compute_count_matrix).

    This function is based on the following article:
    Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan
    '''

    def compute_purity_score(count_matrix, axis=0):
        '''
        Sub function computing the purity along one axis of the count matrix.

        axis=1 groups by row (cluster), axis=0 groups by column (class).
        Return (vector_purity, scalar_purity): the purity of each group
        and their average weighted by the size of the groups.
        '''
        # The builtin float replaces np.float, which was removed from NumPy.
        counts = np.asarray(count_matrix, dtype=float)
        count_per_group = counts.sum(axis=axis)

        # Put the squared totals back on the summed axis so every cell is
        # divided by the total of its OWN group, whatever the axis and
        # whatever the shape of the matrix.
        dividers = np.expand_dims(np.square(count_per_group), axis)
        matrix_divided = np.divide(np.square(counts), dividers,
                                   out=np.zeros_like(counts), where=dividers != 0)
        vector_purity = np.sum(matrix_divided, axis=axis)

        scalar_purity = np.average(vector_purity, weights=count_per_group)
        return (vector_purity, scalar_purity)

    count_matrix = compute_count_matrix(y_truth, y_hat)
    _, purity_cluster_score = compute_purity_score(count_matrix, 1)
    _, purity_class_score = compute_purity_score(count_matrix, 0)

    K = np.sqrt(purity_cluster_score * purity_class_score)

    return {
        "purity_class_score": purity_class_score,
        "purity_cluster_score": purity_cluster_score,
        "K": K
    }
189 198
190 199
if __name__ == "__main__":
    # Small demonstration of the measures on a toy clustering.
    # Hypothesis
    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
    # Truth
    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])

    (result_matrix, result_vector, result) = entropy_score(y, y_hat)

    print(purity_score(y, y_hat))
    # The stray exit(1) that stood here made the entropy report below
    # unreachable and ended the script with a failure status.
    print("Result matrix: ")
    print(result_matrix)
    print("Result vector: ")
    print(result_vector)
    print("Result: ", result)