Commit aeff19f9513b6f3f650ae1c36eff462c545fbf52

Authored by Mathias
1 parent 4ed3ebc7d7
Exists in master

purity measure added and tested

Showing 1 changed file with 42 additions and 3 deletions Inline Diff

1 ''' 1 '''
2 This module is a part of my library. 2 This module is a part of my library.
3 It aims to compute some measures for clustering. 3 It aims to compute some measures for clustering.
4 ''' 4 '''
5 5
6 import numpy as np 6 import numpy as np
7 7
def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
    '''
    Compute the element-wise disequilibrium between two count matrices.

    Both matrices are normalised, then the normalised matrix2 is
    subtracted from the normalised matrix1.

    Parameters:
    - matrix1, matrix2 : 2-D count matrices of identical shape
      (presumably one row per cluster -- confirm against the caller).
    - isGlobal : selects the denominator of the normalisation:
        - True  : each cell is divided by the total of its whole matrix
        - False : each cell is divided by the total of its own row
      A denominator equal to 0 yields 0 instead of a division error.
    - mod : optional whitespace separated list of transformations applied
      to the difference, in the given order:
        - "power" : square every value
        - "human" : multiply every value by 100
        - "abs"   : take the absolute value
      None or an empty string leaves the difference untouched.

    Return a tuple (mask, result):
    - mask   : boolean matrix, False where both inputs are equal to 0
    - result : the (optionally transformed) difference matrix

    Raise ValueError (a subclass of Exception) when mod contains an
    unknown word.
    '''

    def normalise(matrix):
        '''
        Sub function dividing a matrix by its total or by its row totals.
        '''
        # The builtin float replaces np.float, removed from recent NumPy.
        values = np.asarray(matrix, dtype=float)
        if isGlobal:
            dividers = values.sum()
        else:
            # keepdims gives a column vector that broadcasts over each row.
            dividers = values.sum(axis=1, keepdims=True)
        return np.divide(values, dividers, out=np.zeros_like(values), where=dividers != 0)

    transformations = {
        "power": np.square,
        "human": lambda values: values * 100,
        "abs": np.absolute,
    }

    result = normalise(matrix1) - normalise(matrix2)
    mask = np.logical_or(np.asarray(matrix1) != 0, np.asarray(matrix2) != 0)

    # split() without argument ignores empty words, so "" applies nothing.
    for word in (mod or "").split():
        if word not in transformations:
            raise ValueError("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\")")
        result = transformations[word](result)
    return (mask, result)
59 59
60 60
61 61
def disequilibrium_mean_by_cluster(mask, matrix):
    '''
    Mean of the disequilibrium of each cluster.

    Parameters:
    - mask   : boolean matrix with one row per cluster
    - matrix : disequilibrium values, same number of rows as mask

    For every row, the sum of the values of matrix is divided by the
    number of True cells of the matching row of mask.
    A row whose mask is entirely False gets 0 instead of a 0/0 division,
    so a single empty cluster does not turn the results into NaN.

    Return a float vector with one value per row.
    '''
    values = np.asarray(matrix, dtype=float)
    nb_k = len(values)
    if values.size == 0:
        return np.zeros(nb_k)

    # Flatten everything but the first axis so that each row is summed whole.
    totals = values.reshape(nb_k, -1).sum(axis=1)
    counts = np.asarray(mask).reshape(nb_k, -1).sum(axis=1)
    return np.divide(totals, counts, out=np.zeros_like(totals), where=counts != 0)
75 75
76 76
def disequilibrium(matrix1, matrix2, isGlobal=False):
    '''
    Disequilibrium matrix and disequilibrium value of two count matrices.

    Return a tuple with:
    - the mask given by disequilibrium_
    - the raw disequilibrium matrix multiplied by 100
    - the sum of the per-cluster means of the squared disequilibrium,
      divided by the number of rows of matrix1
    '''
    presence, raw = disequilibrium_(matrix1, matrix2, isGlobal)

    # Scalar value: built from the squared differences.
    cluster_means = disequilibrium_mean_by_cluster(presence, np.power(raw, 2))
    value = cluster_means.sum() / matrix1.shape[0]

    # Matrix value: scaled by 100.
    percentages = raw * 100

    return (presence, percentages, value)
91 91
92 92
def compute_count_matrix(y_truth, y_hat):
    '''
    Build the count matrix of a clustering.

    Cell [k][c] holds the number of samples labelled k in y_hat and c in
    y_truth: rows follow y_hat, columns follow y_truth.

    Parameters:
    - y_truth : sequence (list or array) of integer labels, expected >= 0
    - y_hat   : sequence (list or array) of integer labels, expected >= 0,
      same length as y_truth

    Return a float matrix of shape (max(y_hat) + 1, max(y_truth) + 1),
    or an empty (0, 0) matrix when no sample is given.

    Raise AssertionError when the two sequences differ in length.
    '''
    # Check size of the lists. The error is raised explicitly so that the
    # check still runs with "python -O", which strips assert statements.
    if len(y_hat) != len(y_truth):
        raise AssertionError(f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}")

    rows = np.asarray(y_hat)
    columns = np.asarray(y_truth)
    if rows.size == 0:
        return np.zeros((0, 0))

    # Build count matrix. np.add.at accumulates repeated (row, column)
    # pairs, which a plain fancy-indexed "+=" would count only once.
    count_matrix = np.zeros((rows.max() + 1, columns.max() + 1))
    np.add.at(count_matrix, (rows, columns), 1)
    return count_matrix
105 105
106 106
def entropy_score(y_truth, y_hat):
    '''
    Compute the entropy of a clustering.

    Need to use a label encoder before giving y_hat and y_truth.
    Don't use one hot labels.

    Return a tuple with:
    - result_matrix : the matrix with the log multiplied probabilities
      (-P(x) * log2(P(x))), P(x) being the count of a cell divided by the
      total of its row
    - result_vector : the vector summing result_matrix row by row.
      Each value corresponds to a cluster.
    - result : the final entropy measure of the clustering, i.e. the
      values of result_vector averaged with the row totals as weights
      (0.0 when the count matrix holds no sample)

    Raise ValueError when a NaN value appears in result_vector.
    '''
    # Build count matrix. The builtin float replaces np.float, which has
    # been removed from recent NumPy versions.
    count_matrix = np.asarray(compute_count_matrix(y_truth, y_hat), dtype=float)

    # Build dividers vector, used as a column to broadcast over each row
    dividers = count_matrix.sum(axis=1)
    column_dividers = dividers[:, np.newaxis]
    matrix_divided = np.divide(
        count_matrix,
        column_dividers,
        out=np.zeros_like(count_matrix),
        where=column_dividers != 0,
    )

    # log2 is only evaluated on the non-empty cells, the others stay at 0
    log_matrix = np.zeros(matrix_divided.shape)
    np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)
    result_matrix = -1 * np.multiply(matrix_divided, log_matrix)
    result_vector = result_matrix.sum(axis=1)

    if np.isnan(np.sum(result_vector)):
        # Raise instead of printing and exiting: a library function should
        # not terminate the interpreter of its caller.
        raise ValueError(
            "An error occured due to nan value"
            f" (count matrix: {count_matrix.tolist()},"
            f" result vector: {result_vector.tolist()})"
        )

    total = dividers.sum()
    if total == 0:
        # No sample at all: avoid the 0/0 of the weighted average.
        return (result_matrix, result_vector, 0.0)

    result = (result_vector * dividers / total).sum()
    return (result_matrix, result_vector, result)
152 152
153 153
def purity_score(y_truth, y_hat):
    '''
    Compute purity measures of a clustering.

    The count matrix has one row per label of y_hat (cluster) and one
    column per label of y_truth (class). The purity of a row (or column)
    is the sum of the squared shares of its cells:
    sum((count / total of the row or column) ** 2).

    Return a dict with:
    - "purity_class_score"   : mean of the column purities, weighted by
      the column totals
    - "purity_cluster_score" : mean of the row purities, weighted by the
      row totals
    - "K" : square root of the product of the two scores above
    '''

    def compute_purity_score(count_matrix, axis=0):
        '''
        Sub function computing the purity of the groups summed along axis
        (axis=1 : one group per row, axis=0 : one group per column).
        Return (vector_purity, scalar_purity).
        '''
        counts = np.asarray(count_matrix, dtype=float)
        # keepdims makes the totals broadcast along the summed axis, so
        # each cell is divided by the total of its own group whatever the
        # axis and whatever the shape of the matrix.
        totals = counts.sum(axis=axis, keepdims=True)
        shares = np.divide(counts, totals, out=np.zeros_like(counts), where=totals != 0)
        vector_purity = np.square(shares).sum(axis=axis)

        weights = totals.ravel()
        if weights.sum() == 0:
            # np.average raises ZeroDivisionError on all-zero weights.
            return (vector_purity, 0.0)
        scalar_purity = np.average(vector_purity, weights=weights)
        return (vector_purity, scalar_purity)

    count_matrix = compute_count_matrix(y_truth, y_hat)
    _, purity_cluster_score = compute_purity_score(count_matrix, 1)
    _, purity_class_score = compute_purity_score(count_matrix, 0)

    K = np.sqrt(purity_cluster_score * purity_class_score)

    return {
        "purity_class_score": purity_class_score,
        "purity_cluster_score": purity_cluster_score,
        "K": K
    }
189
190
if __name__ == "__main__":
    # Small demonstration of the entropy and purity measures.
    # Hypothesis
    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
    # Truth
    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])

    (result_matrix, result_vector, result) = entropy_score(y, y_hat)

    # The debugging exit(1) that followed this print has been removed: it
    # made the entropy output unreachable and returned a failure status.
    print(purity_score(y, y_hat))
    print("Result matrix: ")
    print(result_matrix)
    print("Result vector: ")
    print(result_vector)
    print("Result: ", result)