Commit 0b307187826ee27ddd5368d2cd2f81671d5fd59a

Authored by Mathias Quillot
1 parent 933b2505a1
Exists in master

normalize the global value of disequilibrium by the number of clusters

Showing 1 changed file with 2 additions and 2 deletions Inline Diff

1 ''' 1 '''
2 This module is a part of my library. 2 This module is a part of my library.
3 It aims to compute some measures for clustering. 3 It aims to compute some measures for clustering.
4 ''' 4 '''
5 5
6 import numpy as np 6 import numpy as np
7 7
def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
    '''
    Compute the disequilibrium matrix between two clustering count matrices.

    Both matrices are normalised into proportions and subtracted
    (matrix1 proportions minus matrix2 proportions).

    isGlobal selects the denominator used for the normalisation:
        - True : every cell is divided by the sum of the whole matrix
        - False : every cell is divided by the sum of its own row
    A zero denominator yields 0 for the affected cells instead of NaN.

    mod is an optional string of space-separated transformations, applied
    to the difference in the order given:
        - "power" : square every value
        - "human" : multiply every value by 100
        - "abs" : take the absolute value
    None or an empty string leaves the raw difference untouched.

    Returns a tuple (mask, result):
        - mask : boolean matrix, True where both inputs are 0
        - result : the (optionally transformed) difference matrix

    Raises ValueError (a subclass of Exception) on an unknown mod word.
    '''
    # assumes both inputs are 2-D (rows x columns) — TODO confirm with callers
    matrix1 = np.asarray(matrix1)
    matrix2 = np.asarray(matrix2)

    def to_proportions(counts):
        '''
        Divide counts by the selected denominator, mapping x/0 to 0.
        '''
        # The builtin float replaces the np.float alias removed in NumPy 1.24.
        counts = np.asarray(counts, dtype=float)
        if isGlobal:
            dividers = counts.sum()
        else:
            # keepdims gives a column vector that broadcasts row by row.
            dividers = counts.sum(axis=1, keepdims=True)
        return np.divide(counts, dividers,
                         out=np.zeros_like(counts), where=dividers != 0)

    result = to_proportions(matrix1) - to_proportions(matrix2)
    mask = (matrix2 == 0) & (matrix1 == 0)

    # Truthiness covers both None and "": the previous test
    # (mod != None or mod == "") let "" through and then raised on it.
    if mod:
        for word in mod.split():
            if word == "power":
                result = np.power(result, 2)
            elif word == "human":
                result = result * 100
            elif word == "abs":
                result = np.absolute(result)
            else:
                raise ValueError("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\"")
    return (mask, result)
58 58
59 59
60 60
def disequilibrium_mean_by_cluster(mask, matrix):
    '''
    Reduce a disequilibrium matrix to one value per cluster (row).

    matrix is the disequilibrium computed from the number of occurrences
    belonging to each class, for each cluster.
    mask is the boolean matrix of the same shape that goes with it.

    For every row i the result is matrix[i].sum() / mask[i].sum().
    Returns a 1-D float array with one entry per row of matrix.

    NOTE(review): the denominator is the number of True cells in the mask
    row. If the mask marks cells that are empty in both clusterings, the
    intended denominator may be the number of unmasked cells instead —
    confirm against the definition used by callers. A row whose mask has
    no True cell produces inf/NaN under NumPy's division rules.
    '''
    values = np.asarray(matrix)
    flags = np.asarray(mask)
    # Sum over every axis except the first, so each row collapses to a
    # scalar whatever the number of trailing dimensions.
    row_sums = values.sum(axis=tuple(range(1, values.ndim)))
    row_counts = flags.sum(axis=tuple(range(1, flags.ndim)))
    return np.asarray(row_sums / row_counts, dtype=float)
73 73
74 74
def disequilibrium(matrix1, matrix2, isGlobal=False):
    '''
    Disequilibrium matrix and global disequilibrium value.

    matrix1, matrix2 and isGlobal are forwarded unchanged to
    disequilibrium_.

    Returns a tuple (mask, result_human, value):
        - mask : the mask returned by disequilibrium_
        - result_human : the raw disequilibrium matrix multiplied by 100
        - value : the per-cluster means of the squared disequilibrium,
          summed and then divided by the number of rows of matrix1
    '''
    mask, result = disequilibrium_(matrix1, matrix2, isGlobal)
    result_human = result * 100
    result_power = np.power(result, 2)

    per_cluster = disequilibrium_mean_by_cluster(mask, result_power)
    # Normalise the global value by the number of clusters (rows).
    global_value = per_cluster.sum() / matrix1.shape[0]

    return (mask, result_human, global_value)
89 89
90 90
def entropy(count_matrix):
    '''
    Compute the entropy of a count matrix, row by row.

    Each row is normalised into proportions p by dividing by its own sum
    (a row summing to 0 gives proportions of 0). The per-cell term is
    -p * log2(p), with empty cells contributing 0.

    Returns a tuple (result_matrix, result):
        - result_matrix : matrix of the per-cell terms -p * log2(p)
        - result : the row entropies (row sums of result_matrix) weighted
          by each row's share of the grand total, then summed; 0 when the
          whole matrix sums to 0
    '''
    # The builtin float replaces the np.float alias removed in NumPy 1.24.
    counts = np.asarray(count_matrix, dtype=float)
    row_totals = counts.sum(axis=1)
    # Column vector so the division broadcasts row by row.
    row_dividers = row_totals[:, np.newaxis]

    proportions = np.divide(counts, row_dividers,
                            out=np.zeros_like(counts), where=row_dividers != 0)

    # An explicit zero-filled `out` is required: with `where=` alone the
    # skipped cells are left uninitialised and 0 * garbage may yield NaN.
    log_proportions = np.log2(proportions,
                              out=np.zeros_like(proportions),
                              where=proportions > 0)
    result_matrix = -1 * proportions * log_proportions

    grand_total = row_totals.sum()
    if grand_total == 0:
        # No observation at all: avoid 0/0 and report a null entropy.
        return (result_matrix, np.float64(0.0))

    result = (result_matrix.sum(axis=1) * row_totals / grand_total).sum()
    return (result_matrix, result)
106