Commit 1f8612ebfd7fe8173f5e7f5374192182a1064da3

Authored by Mathias
1 parent adbca3b1ce
Exists in master

repaired memory error due to np.log2 behaviour

Showing 1 changed file with 167 additions and 0 deletions Inline Diff

File was created 1 '''
2 This module is a part of my library.
3 It aims to compute some measures for clustering.
4 '''
5
6 import numpy as np
7
def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
    '''
    Compute the element-wise disequilibrium between two matrices.

    Both matrices are normalised, then the normalised matrix2 is
    subtracted from the normalised matrix1.

    Parameters:
    - matrix1, matrix2 : 2-D array-likes of the same shape.
    - isGlobal : selects the denominator used for the normalisation:
        - True  : each value is divided by the sum of the whole matrix
        - False : each value is divided by the sum of its own row
      A denominator equal to 0 yields 0 instead of a division error.
    - mod : optional string of whitespace separated keywords, applied
      in the given order to the difference matrix:
        - "power" : square the values
        - "human" : multiply the values by 100
        - "abs"   : take the absolute value
      None or an empty string leaves the difference untouched.

    Return a tuple (mask, result):
    - mask : boolean matrix, False where both inputs are 0
    - result : the (optionally post-processed) difference matrix

    Raise ValueError (a subclass of Exception) on an unknown keyword.
    '''

    def normalise(matrix):
        '''
        Sub function dividing a matrix by its row sums or its total sum.
        '''
        # The builtin float replaces the np.float alias removed in NumPy 1.24.
        values = np.asarray(matrix, dtype=float)
        if isGlobal:
            dividers = values.sum()
        else:
            # keepdims gives a column vector, so every row is divided
            # by its own sum through broadcasting.
            dividers = values.sum(axis=1, keepdims=True)
        return np.divide(values, dividers,
                         out=np.zeros_like(values), where=dividers != 0)

    transforms = {
        "power": lambda values: np.power(values, 2),
        "human": lambda values: values * 100,
        "abs": np.absolute,
    }

    result = normalise(matrix1) - normalise(matrix2)

    mask = np.logical_not(np.logical_and(np.asarray(matrix2) == 0,
                                         np.asarray(matrix1) == 0))

    # "if mod" skips both None and "", which used to raise on the
    # empty keyword produced by "".split(" ").
    if mod:
        for word in mod.split():
            if word not in transforms:
                raise ValueError("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\")")
            result = transforms[word](result)
    return (mask, result)
59
60
61
def disequilibrium_mean_by_cluster(mask, matrix):
    '''
    Mean of the disequilibrium values of each row.

    Parameters:
    - mask : boolean array-like with the same shape as matrix; the
      number of True values of a row is the denominator of its mean.
    - matrix : the disequilibrium values (one row per cluster,
      according to the original description of this function).

    Return a 1-D array holding, for each row, the sum of the row of
    matrix divided by the number of True values in the row of mask.
    A row without any True value gives 0 instead of nan (0 / 0).
    '''
    values = np.asarray(matrix, dtype=float)
    valid = np.asarray(mask, dtype=bool)

    # Reduce over every axis except the first one, so each entry along
    # the first axis is summed entirely, as the per-row loop did.
    inner_axes = tuple(range(1, values.ndim))
    totals = values.sum(axis=inner_axes)
    counts = valid.sum(axis=inner_axes)

    return np.divide(totals, counts,
                     out=np.zeros(totals.shape), where=counts != 0)
75
76
def disequilibrium(matrix1, matrix2, isGlobal=False):
    '''
    Disequilibrium matrix and disequilibrium value.

    Return a tuple with:
    - the mask computed by disequilibrium_
    - the raw difference matrix multiplied by 100
    - a single value: the sum of the per-row means of the squared
      differences, divided by the size of the first dimension of
      matrix1
    '''
    mask, raw_diff = disequilibrium_(matrix1, matrix2, isGlobal)

    # The global value is built from the squared differences.
    squared_diff = np.power(raw_diff, 2)
    row_means = disequilibrium_mean_by_cluster(mask, squared_diff)
    global_value = row_means.sum() / matrix1.shape[0]

    # The returned matrix is scaled by 100.
    scaled_diff = raw_diff * 100

    return (mask, scaled_diff, global_value)
91
92
def compute_count_matrix(y_hat, y_truth):
    '''
    Build the matrix counting the co-occurrences of two label lists.

    Parameters:
    - y_hat, y_truth : sequences (lists or arrays) of integer labels,
      of the same length. Labels are used directly as indices.

    Return a float matrix of shape (max(y_hat) + 1, max(y_truth) + 1)
    where the cell [i][j] is the number of positions k such that
    y_hat[k] == i and y_truth[k] == j. Two empty sequences give an
    empty (0, 0) matrix.

    Raise AssertionError when the sequences have different lengths.
    '''
    predicted = np.asarray(y_hat)
    truth = np.asarray(y_truth)

    # Explicit raise: unlike an assert statement, this check is kept
    # when Python runs with -O.
    if len(predicted) != len(truth):
        raise AssertionError(f"Matrices should have the same length y_hat: {len(predicted)}, y_truth: {len(truth)}")

    if predicted.size == 0:
        return np.zeros((0, 0))

    # Build count matrix
    count_matrix = np.zeros((predicted.max() + 1, truth.max() + 1))
    # np.add.at accumulates repeated (row, column) pairs, which a plain
    # fancy-indexed "+= 1" would count only once.
    np.add.at(count_matrix, (predicted, truth), 1)
    return count_matrix
105
106
def entropy_score(y_truth, y_hat):
    '''
    Entropy of a clustering with respect to the true classes.

    Need to use a label encoder before giving y_hat and y_truth.
    Don't use one hot labels.

    Return a tuple with:
    - result_matrix : the matrix with the log multiplied probabilities
      (-P(x) * log2(P(x))), one row per value of y_hat
    - result_vector : the vector summing the entropy of each class.
      Each value corresponds to a cluster (a row of result_matrix).
    - result : the final entropy measure of the clustering, i.e. the
      values of result_vector weighted by the share of samples of
      each row

    Raise ValueError when a nan value appears in the computation.
    '''
    # Build count matrix
    # The builtin float replaces the np.float alias removed in NumPy 1.24.
    count_matrix = np.asarray(compute_count_matrix(y_hat, y_truth), dtype=float)

    # Build dividers vector: number of samples in each row
    dividers = count_matrix.sum(axis=1)
    column_dividers = dividers[:, np.newaxis]

    # Probability of each class inside each row; rows without samples
    # stay at 0.
    matrix_divided = np.divide(count_matrix, column_dividers,
                               out=np.zeros_like(count_matrix),
                               where=column_dividers != 0)

    # log2 is only evaluated on non-empty cells: log2(0) is -inf and
    # 0 * -inf would give nan.
    log_matrix = np.zeros(matrix_divided.shape)
    np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)

    result_matrix = -1 * np.multiply(matrix_divided, log_matrix)
    result_vector = result_matrix.sum(axis=1)

    if np.isnan(np.sum(result_vector)):
        # Raise instead of printing and calling exit(): a library
        # function must not terminate the interpreter of its caller.
        raise ValueError(
            "An error occured due to nan value\n"
            f"COUNT MATRIX\n{count_matrix}\n"
            f"MATRIX DIVIDED\n{matrix_divided}\n"
            f"RESULT MATRIX\n{result_matrix}\n"
            f"VECTOR MATRIX\n{result_vector}"
        )

    total = dividers.sum()
    if total == 0:
        # No sample at all: avoid 0 / 0 and report a null entropy.
        return (result_matrix, result_vector, 0.0)

    result = (result_vector * dividers / total).sum()
    return (result_matrix, result_vector, result)
152
153
154
if __name__ == "__main__":
    # Small demonstration of entropy_score on hand-written labels.
    # Hypothesis
    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
    # Truth
    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])

    # The function is named entropy_score; the previous call to
    # "entropy" raised a NameError.
    (result_matrix, result_vector, result) = entropy_score(y, y_hat)

    print("Result matrix: ")
    print(result_matrix)
    print("Result vector: ")
    print(result_vector)
    print("Result: ", result)