Commit 1f8612ebfd7fe8173f5e7f5374192182a1064da3
1 parent
adbca3b1ce
Exists in
master
repaired memory error due to np.log2 behaviour
Showing 1 changed file with 167 additions and 0 deletions Inline Diff
volia/measures.py
File was created | 1 | ''' | |
2 | This module is a part of my library. | ||
3 | It aims to compute some measures for clustering. | ||
4 | ''' | ||
5 | |||
6 | import numpy as np | ||
7 | |||
def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
    '''
    Compute the cell-by-cell disequilibrium between two count matrices.

    Each matrix is first turned into proportions, then the proportions of
    matrix2 are subtracted from those of matrix1.

    Parameters:
    - matrix1, matrix2 : 2-D count matrices (array-like) of the same shape.
    - isGlobal : selects the denominator used to build the proportions:
        - True  : every cell is divided by the grand total of its matrix
        - False : every cell is divided by the total of its own row
      A zero denominator produces 0 instead of nan/inf.
    - mod : optional string of whitespace-separated transformations applied
      to the difference, in the given order:
        - "power" : square the values
        - "human" : multiply the values by 100
        - "abs"   : take the absolute value
      None or an empty string leaves the difference untouched.

    Return a tuple (mask, result):
    - mask   : boolean matrix, False only where both inputs are zero
    - result : the (possibly transformed) difference matrix

    Raise an Exception when mod contains an unknown word.
    '''

    def normalise(counts):
        '''
        Sub function turning a count matrix into proportions.
        '''
        if isGlobal:
            dividers = counts.sum()
        else:
            # keepdims keeps a column vector so each row is divided by its own total
            dividers = counts.sum(axis=1, keepdims=True)
        return np.divide(counts, dividers, out=np.zeros_like(counts), where=dividers != 0)

    # The builtin float replaces the np.float alias removed from recent NumPy releases
    counts1 = np.asarray(matrix1, dtype=float)
    counts2 = np.asarray(matrix2, dtype=float)

    result = normalise(counts1) - normalise(counts2)

    mask = np.logical_not(np.logical_and(counts2 == 0, counts1 == 0))

    # A falsy mod (None or "") means "no transformation"
    if mod:
        for word in mod.split():
            if word == "power":
                result = np.power(result, 2)
            elif word == "human":
                result = result * 100
            elif word == "abs":
                result = np.absolute(result)
            else:
                raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\"")
    return (mask, result)
59 | |||
60 | |||
61 | |||
def disequilibrium_mean_by_cluster(mask, matrix):
    '''
    Mean of the disequilibrium for each row (cluster).

    Parameters:
    - mask   : boolean matrix marking the cells that count in the mean
    - matrix : disequilibrium values, one row per cluster

    For every row, the sum of the row of matrix is divided by the number
    of True cells in the matching row of mask. A row whose mask has no
    True cell gets 0 instead of the nan a 0/0 division would give.

    Return a 1-D array with one mean per row.
    '''
    results = np.zeros(len(matrix))

    for i, (row, row_mask) in enumerate(zip(matrix, mask)):
        active = np.sum(row_mask)
        # Skip rows without any active cell: their mean stays at 0
        if active:
            results[i] = np.sum(row) / active
    return results
75 | |||
76 | |||
def disequilibrium(matrix1, matrix2, isGlobal=False):
    '''
    Disequilibrium matrix and global disequilibrium value.

    Return a tuple with:
    - the mask given by disequilibrium_
    - the raw disequilibrium matrix multiplied by 100
    - the sum of the per-row means of the squared disequilibrium,
      divided by the number of rows of matrix1
    '''
    mask, raw = disequilibrium_(matrix1, matrix2, isGlobal)

    squared_means = disequilibrium_mean_by_cluster(mask, np.power(raw, 2))
    nb_rows = matrix1.shape[0]

    return (mask, raw * 100, squared_means.sum() / nb_rows)
91 | |||
92 | |||
def compute_count_matrix(y_hat, y_truth):
    '''
    Build the count matrix of two label sequences.

    Parameters:
    - y_hat   : predicted labels, non-negative integers (array-like)
    - y_truth : reference labels, non-negative integers (array-like)

    Return a float matrix of shape (max(y_hat) + 1, max(y_truth) + 1)
    where cell [i][j] is the number of positions k such that
    y_hat[k] == i and y_truth[k] == j. Empty inputs give a (0, 0) matrix.

    Raise an AssertionError when the two sequences differ in length.
    '''
    y_hat = np.asarray(y_hat)
    y_truth = np.asarray(y_truth)

    # Check size of the lists (explicit raise so the check survives python -O)
    if len(y_hat) != len(y_truth):
        raise AssertionError(f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}")

    # Nothing to count: max() is undefined on empty input
    if len(y_hat) == 0:
        return np.zeros((0, 0))

    # Build count matrix
    count_matrix = np.zeros((y_hat.max() + 1, y_truth.max() + 1))
    # np.add.at accumulates repeated (i, j) pairs, unlike count_matrix[i, j] += 1
    np.add.at(count_matrix, (y_hat, y_truth), 1)
    return count_matrix
105 | |||
106 | |||
def entropy_score(y_truth, y_hat):
    '''
    Entropy of a clustering with respect to reference labels.

    Need to use a label encoder before giving y_hat and y_truth.
    Don't use one hot labels.

    Parameters:
    - y_truth : reference labels
    - y_hat   : cluster labels (rows of the count matrix)

    Return a tuple with:
    - result_matrix : the matrix of -P(x) * log2(P(x)), where P(x) is the
      count of a cell divided by the total of its row
    - result_vector : the vector summing result_matrix over each row.
      Each value corresponds to a cluster.
    - result : the final entropy measure of the clustering, i.e. the sum
      of result_vector weighted by the share of each row in the total count

    If a nan shows up in result_vector, the intermediate matrices are
    printed and SystemExit(1) is raised.
    '''
    # Build count matrix
    count_matrix = np.asarray(compute_count_matrix(y_hat, y_truth), dtype=float)

    # Build dividers vector
    dividers = count_matrix.sum(axis=1)

    # Divide each row by its own total; rows with a zero total stay at 0
    row_totals = dividers[:, np.newaxis]
    matrix_divided = np.divide(
        count_matrix,
        row_totals,
        out=np.zeros_like(count_matrix),
        where=row_totals != 0,
    )

    # log2 is only evaluated on non-empty cells, the others keep their initial 0
    log_matrix = np.zeros(matrix_divided.shape)
    np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)
    result_matrix = -1 * np.multiply(matrix_divided, log_matrix)
    result_vector = result_matrix.sum(axis=1)

    if np.isnan(np.sum(result_vector)):
        print("COUNT MATRIX")
        print(count_matrix)
        print("MATRIX DIVIDED")
        print(matrix_divided)
        print("RESULT MATRIX")
        print(result_matrix)
        print("VECTOR MATRIX")
        print(result_vector)
        print("An error occured due to nan value, some values are printed before")
        # Same effect as exit(1) without relying on the helper injected by the site module
        raise SystemExit(1)

    result = (result_vector * dividers / dividers.sum()).sum()
    return (result_matrix, result_vector, result)
152 | |||
153 | |||
154 | |||
if __name__ == "__main__":
    # Small demo: compute the entropy of a toy clustering.
    # Hypothesis
    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
    # Truth
    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])

    # The function defined in this module is entropy_score (there is no "entropy")
    (result_matrix, result_vector, result) = entropy_score(y, y_hat)

    print("Result matrix: ")
    print(result_matrix)
    print("Result vector: ")
    print(result_vector)
    print("Result: ", result)