Commit 0b307187826ee27ddd5368d2cd2f81671d5fd59a
1 parent
933b2505a1
Exists in
master
normalize the global value of disequilibrium by the number of clusters
Showing 1 changed file with 2 additions and 2 deletions Inline Diff
bin/measures.py
1 | ''' | 1 | ''' |
2 | This module is a part of my library. | 2 | This module is a part of my library. |
3 | It aims to compute some measures for clustering. | 3 | It aims to compute some measures for clustering. |
4 | ''' | 4 | ''' |
5 | 5 | ||
6 | import numpy as np | 6 | import numpy as np |
7 | 7 | ||
def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
    '''
    Compute the disequilibrium matrix between two clustering sets.

    Both inputs are count matrices of the same shape (one row per cluster,
    judging by how the callers in this module use them). Each matrix is
    first normalised, then the second is subtracted from the first.

    Parameters:
    - matrix1, matrix2: array-like objects exposing ``.sum()`` and
      element-wise ``== 0`` (NumPy arrays in practice).
    - isGlobal: selects the denominator used for the normalisation:
        - True : every cell is divided by the grand total of its matrix;
        - False: every cell is divided by the total of its own row.
      A denominator equal to 0 yields 0 instead of a division error.
    - mod: optional whitespace-separated list of post-processing steps,
      applied from left to right on the difference matrix:
        - "power": square every value;
        - "human": multiply every value by 100;
        - "abs"  : take the absolute value.
      ``None`` or an empty string leaves the difference untouched.

    Returns a tuple ``(mask, result)`` where ``mask`` is True for the cells
    that are 0 in both input matrices and ``result`` is the (optionally
    post-processed) difference matrix.

    Raises ValueError when ``mod`` contains an unknown word.
    '''

    def divide_line(a, divider):
        '''
        Divide one column of the matrix by the divider(s); cells whose
        divider is 0 are left at 0.
        '''
        return np.divide(a, divider, out=np.zeros_like(a), where=divider != 0)

    if isGlobal:
        dividers1 = matrix1.sum()
        dividers2 = matrix2.sum()
    else:
        dividers1 = matrix1.sum(axis=1)
        dividers2 = matrix2.sum(axis=1)

    # The builtin float is used because the np.float alias no longer exists
    # in recent NumPy releases.
    matrix1_divided = np.apply_along_axis(
        divide_line, 0, np.asarray(matrix1, dtype=float), dividers1)
    matrix2_divided = np.apply_along_axis(
        divide_line, 0, np.asarray(matrix2, dtype=float), dividers2)

    result = matrix1_divided - matrix2_divided
    mask = (matrix2 == 0) & (matrix1 == 0)

    # None and "" both mean "no post-processing".
    if mod:
        for word in mod.split():
            if word == "power":
                result = np.power(result, 2)
            elif word == "human":
                result = result * 100
            elif word == "abs":
                result = np.absolute(result)
            else:
                raise ValueError("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\"")
    return (mask, result)
58 | 58 | ||
59 | 59 | ||
60 | 60 | ||
def disequilibrium_mean_by_cluster(mask, matrix):
    '''
    Per-cluster mean of a disequilibrium matrix.

    For every row of ``matrix`` (one row per cluster), the sum of the row
    is divided by the number of True cells in the matching row of ``mask``.
    Returns a 1-D float array with one value per row.
    '''
    # NOTE(review): the divisor is the count of True cells in the mask; if
    # the mask marks the cells to ignore, this should probably be the count
    # of False cells instead - confirm with the author before changing.
    cluster_indices = range(len(matrix))
    return np.fromiter(
        (matrix[k].sum() / mask[k].sum() for k in cluster_indices),
        dtype=float,
    )
73 | 73 | ||
74 | 74 | ||
def disequilibrium(matrix1, matrix2, isGlobal=False):
    '''
    Disequilibrium matrix and global disequilibrium value.

    Returns a tuple ``(mask, matrix, value)``:
    - mask  : the mask produced by ``disequilibrium_``;
    - matrix: the raw difference matrix multiplied by 100;
    - value : the sum of the per-cluster means of the squared differences,
      divided by the number of rows of ``matrix1``.
    '''
    mask, raw_diff = disequilibrium_(matrix1, matrix2, isGlobal)

    squared_diff = np.power(raw_diff, 2)
    per_cluster = disequilibrium_mean_by_cluster(mask, squared_diff)

    # Normalise the global value by the number of clusters (rows).
    nb_clusters = matrix1.shape[0]
    global_value = per_cluster.sum() / nb_clusters

    return (mask, raw_diff * 100, global_value)
89 | 89 | ||
90 | 90 | ||
def entropy(count_matrix):
    '''
    Compute the entropy of a count matrix.

    Every row of ``count_matrix`` is normalised by its own total, and each
    cell contributes ``-p * log2(p)``. Cells whose count is 0 contribute 0.

    Returns a tuple ``(result_matrix, result)``:
    - result_matrix: the per-cell contributions, same shape as the input;
    - result       : the sum of the row entropies, each weighted by the
      share of that row's total in the grand total.

    If the matrix is non-empty but contains only zeros, the grand total is
    0 and ``result`` is NaN.
    '''
    def divide_line(a, divider):
        '''
        Divide one column of the matrix by the divider(s); cells whose
        divider is 0 are left at 0.
        '''
        return np.divide(a, divider, out=np.zeros_like(a), where=divider != 0)

    dividers = count_matrix.sum(axis=1)

    # The builtin float is used because the np.float alias no longer exists
    # in recent NumPy releases.
    matrix_divided = np.apply_along_axis(
        divide_line, 0, np.asarray(count_matrix, dtype=float), dividers)

    # An explicit zero-filled "out" is required: with "where" alone the
    # skipped cells stay uninitialised, and 0 * garbage can yield NaN.
    log_values = np.log2(
        matrix_divided,
        out=np.zeros_like(matrix_divided),
        where=count_matrix != 0,
    )

    result_matrix = -1 * matrix_divided * log_values
    result = result_matrix.sum(axis=1) * dividers / dividers.sum()
    result = result.sum()
    return (result_matrix, result)
106 |