Commit 503bfd9274290724fe9f4f0668a8fea7134a071b
1 parent
aeff19f951
Exists in
master
Add comments to the purity_score function
Showing 1 changed file with 9 additions and 0 deletions Inline Diff
volia/measures.py
1 | ''' | 1 | ''' |
2 | This module is a part of my library. | 2 | This module is a part of my library. |
3 | It aims to compute some measures for clustering. | 3 | It aims to compute some measures for clustering. |
4 | ''' | 4 | ''' |
5 | 5 | ||
6 | import numpy as np | 6 | import numpy as np |
7 | 7 | ||
def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
    '''
    Compute the element-wise disequilibrium between two count matrices.

    Both matrices are normalised, then the normalised matrix2 is
    subtracted from the normalised matrix1.

    matrix1, matrix2 : 2-D array-likes with the same shape.

    isGlobal selects the denominator of the normalisation:
        - True  : each value is divided by the sum of the whole matrix
        - False : each value is divided by the sum of its own row
    A denominator equal to 0 produces 0 instead of a division error.

    mod is an optional string of space separated post-processing steps,
    applied in the order in which they are written:
        - "power" : square the values
        - "human" : multiply the values by 100
        - "abs"   : take the absolute value
    None or an empty string leaves the difference untouched.

    Return a tuple (mask, result):
        - mask   : boolean matrix, False where both inputs are 0
        - result : the (optionally post-processed) difference

    Raise an Exception when mod contains an unknown step.
    '''

    def normalise(matrix):
        '''
        Sub function dividing a matrix by its total or by its row sums.
        '''
        # The builtin float replaces np.float, which was removed from NumPy.
        values = np.asarray(matrix, dtype=float)
        if isGlobal:
            dividers = values.sum()
        else:
            # keepdims makes the row sums broadcast against every column.
            dividers = values.sum(axis=1, keepdims=True)
        return np.divide(values, dividers, out=np.zeros_like(values), where=dividers != 0)

    result = normalise(matrix1) - normalise(matrix2)

    # A cell only counts when at least one of the two matrices is non zero.
    mask = np.logical_not(np.logical_and(np.asarray(matrix2) == 0, np.asarray(matrix1) == 0))

    steps = {
        "power": np.square,
        "human": lambda values: values * 100,
        "abs": np.absolute,
    }

    # `mod or ""` covers both None and the empty string; split() without
    # argument ignores repeated spaces.
    for word in (mod or "").split():
        if word not in steps:
            raise Exception('Need to specify an accepted mod of the disequilibrium ("power", "human" or "abs"')
        result = steps[word](result)
    return (mask, result)
59 | 59 | ||
60 | 60 | ||
61 | 61 | ||
def disequilibrium_mean_by_cluster(mask, matrix):
    '''
    Mean of the disequilibrium of each row.

    mask   : boolean array-like, True on the cells that take part in the mean
    matrix : array-like of disequilibrium values with the same first dimension

    For every row, the sum of the values of matrix is divided by the
    number of True cells of mask. A row without any True cell gets a
    mean of 0 instead of a division by zero.

    Return a 1-D float array with one mean per row.
    '''
    values = np.asarray(matrix, dtype=float)
    selected = np.asarray(mask)

    # Sum over every axis except the first one, so that each row is reduced
    # to a single number.
    totals = values.sum(axis=tuple(range(1, values.ndim)))
    counts = selected.sum(axis=tuple(range(1, selected.ndim)))

    return np.divide(totals, counts, out=np.zeros_like(totals), where=counts != 0)
75 | 75 | ||
76 | 76 | ||
def disequilibrium(matrix1, matrix2, isGlobal=False):
    '''
    Disequilibrium matrix and disequilibrium value of two count matrices.

    Return a tuple with:
        - the mask returned by disequilibrium_
        - the disequilibrium matrix multiplied by 100
        - the sum of the per row means of the squared disequilibrium,
          divided by the number of rows of matrix1
    '''
    mask, raw = disequilibrium_(matrix1, matrix2, isGlobal)

    squared_means = disequilibrium_mean_by_cluster(mask, np.power(raw, 2))
    overall = squared_means.sum() / matrix1.shape[0]

    return (mask, raw * 100, overall)
91 | 91 | ||
92 | 92 | ||
def compute_count_matrix(y_truth, y_hat):
    '''
    Build the count matrix between predicted and true labels.

    y_truth : sequence of non-negative integer labels (the truth)
    y_hat   : sequence of non-negative integer labels (the hypothesis),
              with the same length as y_truth

    Return a float matrix of shape (max(y_hat) + 1, max(y_truth) + 1)
    where the cell [k][c] is the number of positions i such that
    y_hat[i] == k and y_truth[i] == c. Two empty inputs give an empty
    matrix of shape (0, 0).

    Raise AssertionError when the two sequences have different lengths.
    '''
    # Raised explicitly so that the check is still done with `python -O`,
    # while callers keep seeing the same exception type as before.
    if len(y_hat) != len(y_truth):
        raise AssertionError(
            f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}"
        )

    # Converting first makes plain Python lists accepted as well as arrays.
    clusters = np.asarray(y_hat)
    classes = np.asarray(y_truth)
    if clusters.size == 0:
        return np.zeros((0, 0))

    count_matrix = np.zeros((clusters.max() + 1, classes.max() + 1))
    # np.add.at accumulates correctly when the same (row, column) pair
    # appears several times, unlike `count_matrix[clusters, classes] += 1`.
    np.add.at(count_matrix, (clusters, classes), 1)
    return count_matrix
105 | 105 | ||
106 | 106 | ||
def entropy_score(y_truth, y_hat):
    '''
    Compute the entropy of a clustering with respect to the true labels.

    Need to use a label encoder before giving y_hat and y_truth.
    Don't use one hot labels.

    Return a tuple with:
        - result_matrix : the matrix with the log multiplied probabilities
          (-P(x) * log2(P(x))), one row per cluster, one column per class
        - result_vector : the vector summing the entropy of each class.
          Each value corresponds to a cluster.
        - result : the final entropy measure of the clustering, i.e. the
          mean of result_vector weighted by the size of each cluster

    Raise ValueError when the computation produces a nan value.
    '''
    # Build count matrix (rows follow y_hat, columns follow y_truth)
    count_matrix = np.asarray(compute_count_matrix(y_truth, y_hat), dtype=float)

    # Build dividers vector: number of elements in each cluster
    dividers = count_matrix.sum(axis=1)
    column_dividers = dividers[:, np.newaxis]

    # P(class | cluster); an empty cluster keeps a row of zeros.
    matrix_divided = np.divide(
        count_matrix,
        column_dividers,
        out=np.zeros_like(count_matrix),
        where=column_dividers != 0,
    )

    # log2 is only evaluated on non empty cells, the others stay at 0 so
    # that they contribute 0 to the entropy.
    log_matrix = np.zeros(matrix_divided.shape)
    np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)

    result_matrix = -1 * np.multiply(matrix_divided, log_matrix)
    result_vector = result_matrix.sum(axis=1)

    # A library function must not stop the interpreter: report the problem
    # with an exception carrying the values useful for debugging.
    if np.isnan(np.sum(result_vector)):
        raise ValueError(
            "An error occured due to nan value\n"
            f"COUNT MATRIX\n{count_matrix}\n"
            f"MATRIX DIVIDED\n{matrix_divided}\n"
            f"RESULT MATRIX\n{result_matrix}\n"
            f"VECTOR MATRIX\n{result_vector}"
        )

    result = (result_vector * dividers / dividers.sum()).sum()
    return (result_matrix, result_vector, result)
152 | 152 | ||
153 | 153 | ||
def purity_score(y_truth, y_hat):
    '''
    Return three values in a dictionary:
        - purity_class_score: the purity score of the class (asp)
        - purity_cluster_score: the purity score of the cluster (acp)
        - K: the overall evaluation criterion (sqrt(asp * acp))

    y_truth and y_hat are label encoded sequences of the same length.

    This function is based on the following article:
    Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan
    '''

    def compute_purity_score(count_matrix, axis=0):
        '''
        Sub function computing the purity along one axis of the count matrix.

        axis=1 sums over the columns of each row, axis=0 sums over the
        rows of each column.

        Return a tuple (vector_purity, scalar_purity): the purity of each
        row (or column) and their mean weighted by the row (or column) totals.
        '''
        # keepdims lets the totals broadcast against the matrix whatever the
        # axis, so every cell is divided by the total of its own row
        # (axis=1) or of its own column (axis=0).
        totals = count_matrix.sum(axis=axis, keepdims=True)
        matrix_divided = np.divide(
            np.square(count_matrix),
            np.square(totals),
            out=np.zeros_like(count_matrix),
            where=totals != 0,
        )
        vector_purity = matrix_divided.sum(axis=axis)

        scalar_purity = np.average(vector_purity, weights=np.squeeze(totals, axis=axis))
        return (vector_purity, scalar_purity)

    count_matrix = np.asarray(compute_count_matrix(y_truth, y_hat), dtype=float)
    _, purity_cluster_score = compute_purity_score(count_matrix, 1)
    _, purity_class_score = compute_purity_score(count_matrix, 0)

    K = np.sqrt(purity_cluster_score * purity_class_score)

    return {
        "purity_class_score": purity_class_score,
        "purity_cluster_score": purity_cluster_score,
        "K": K
    }
189 | 198 | ||
190 | 199 | ||
if __name__ == "__main__":
    # Small manual check of the measures on a toy clustering.

    # Hypothesis
    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
    # Truth
    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])

    (result_matrix, result_vector, result) = entropy_score(y, y_hat)

    print(purity_score(y, y_hat))
    # No early exit here: it made the entropy report below unreachable
    # and ended the script with a failure status.
    print("Result matrix: ")
    print(result_matrix)
    print("Result vector: ")
    print(result_vector)
    print("Result: ", result)