Quillot Mathias / volia

1

'''

1

'''

2

This module is a part of my library.

2

This module is a part of my library.

3

It aims to compute some measures for clustering.

3

It aims to compute some measures for clustering.

4

'''

4

'''

5

6

import numpy as np

6

import numpy as np

7

8

def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):

8

def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):

9

'''

9

'''

10

Compute disequilibrium for all the clusters.

10

Compute disequilibrium for all the clusters.

11

The disequilibrium is compute from the difference

11

The disequilibrium is compute from the difference

12

between two clustering sets.

12

between two clustering sets.

13

isGlobal permet à l'utilisateur de choisir le dénominateur de

13

isGlobal permet à l'utilisateur de choisir le dénominateur de

14

la fonction :

14

la fonction :

15

- True : divise la valeur par le nombre d'élément du cluster

15

- True : divise la valeur par le nombre d'élément du cluster

16

- False : divise la valeur par le nombre d'élément total

16

- False : divise la valeur par le nombre d'élément total

17

18

withPower permet à l'utilisateur de décider d'appliquer un carré 2 ou

18

withPower permet à l'utilisateur de décider d'appliquer un carré 2 ou

19

une valeur absolue.

19

une valeur absolue.

20

'''

20

'''

21

22

def divide_line(a, divider):

22

def divide_line(a, divider):

23

'''

23

'''

24

Sub function used for dividing matrix by a vector line by line.

24

Sub function used for dividing matrix by a vector line by line.

25

'''

25

'''

26

return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)

26

return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)

27

28

dividers1 = 0

28

dividers1 = 0

29

dividers2 = 0

29

dividers2 = 0

30

31

if isGlobal:

31

if isGlobal:

32

dividers1 = matrix1.sum()

32

dividers1 = matrix1.sum()

33

dividers2 = matrix2.sum()

33

dividers2 = matrix2.sum()

34

else:

34

else:

35

dividers1 = matrix1.sum(axis=1)

35

dividers1 = matrix1.sum(axis=1)

36

dividers2 = matrix2.sum(axis=1)

36

dividers2 = matrix2.sum(axis=1)

37

38

matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=np.float), dividers1)

38

matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=np.float), dividers1)

39

40

matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=np.float), dividers2)

40

matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=np.float), dividers2)

41

42

diff = matrix1_divided - matrix2_divided

42

diff = matrix1_divided - matrix2_divided

43

44

mask = np.logical_not(np.logical_and(matrix2==0, matrix1==0))

44

mask = np.logical_not(np.logical_and(matrix2==0, matrix1==0))

45

46

result = diff

46

result = diff

47

48

if mod != None or mod == "":

48

if mod != None or mod == "":

49

for word in mod.split(" "):

49

for word in mod.split(" "):

50

if word == "power":

50

if word == "power":

51

result = np.power(result,2)

51

result = np.power(result,2)

52

elif word == "human":

52

elif word == "human":

53

result = result * 100

53

result = result * 100

54

elif word == "abs":

54

elif word == "abs":

55

result = np.absolute(result)

55

result = np.absolute(result)

56

else:

56

else:

57

raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\"")

57

raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\"")

58

return (mask, result)

58

return (mask, result)

59

60

61

62

def disequilibrium_mean_by_cluster(mask, matrix):

62

def disequilibrium_mean_by_cluster(mask, matrix):

63

'''

63

'''

64

Mean of disequilibrium

64

Mean of disequilibrium

65

matrix is the disequilibrium calculated

65

matrix is the disequilibrium calculated

66

from number of occurences belonging to a class,

66

from number of occurences belonging to a class,

67

for each cluster.

67

for each cluster.

68

'''

68

'''

69

nb_k = len(matrix)

69

nb_k = len(matrix)

70

results = np.zeros((nb_k))

70

results = np.zeros((nb_k))

71

72

for i in range(nb_k):

72

for i in range(nb_k):

73

results[i] = matrix[i].sum() / mask[i].sum()

73

results[i] = matrix[i].sum() / mask[i].sum()

74

return results

74

return results

75

76

77

def disequilibrium(matrix1, matrix2, isGlobal=False):

77

def disequilibrium(matrix1, matrix2, isGlobal=False):

78

'''

78

'''

79

Disequilibrium matrix

79

Disequilibrium matrix

80

And Disequilibrium value

80

And Disequilibrium value

81

'''

81

'''

82

mask, result = disequilibrium_(matrix1, matrix2, isGlobal)

82

mask, result = disequilibrium_(matrix1, matrix2, isGlobal)

83

result_human = result * 100

83

result_human = result * 100

84

result_power = np.power(result, 2)

84

result_power = np.power(result, 2)

85

86

return (

86

return (

87

mask,

87

mask,

88

result_human,

88

result_human,

89

disequilibrium_mean_by_cluster(mask, result_power).sum()/matrix1.shape[0]

89

disequilibrium_mean_by_cluster(mask, result_power).sum()/matrix1.shape[0]

90

)

90

)

91

92

93

def compute_count_matrix(y_hat, y_truth):

93

def compute_count_matrix(y_truth, y_hat):

94

'''

94

'''

95

Check the size of the lists with assertion

95

Check the size of the lists with assertion

96

'''

96

'''

97

# Check size of the lists

97

# Check size of the lists

98

assert len(y_hat) == len(y_truth), f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}"

98

assert len(y_hat) == len(y_truth), f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}"

99

100

# Build count matrix

100

# Build count matrix

101

count_matrix = np.zeros((max(y_hat+1), max(y_truth+1)))

101

count_matrix = np.zeros((max(y_hat+1), max(y_truth+1)))

102

for i in range(len(y_hat)):

102

for i in range(len(y_hat)):

103

count_matrix[y_hat[i]][y_truth[i]] += 1

103

count_matrix[y_hat[i]][y_truth[i]] += 1

104

return count_matrix

104

return count_matrix

105

106

107

def entropy_score(y_truth, y_hat):

107

def entropy_score(y_truth, y_hat):

108

'''

108

'''

109

Need to use label encoder before givin y_hat and y_truth

109

Need to use label encoder before givin y_hat and y_truth

110

Don't use one hot labels

110

Don't use one hot labels

111

112

Return a tuple with:

112

Return a tuple with:

113

- result_matrix : the matrix with the log multiplied probabilities (P(x) * log(P(x)))

113

- result_matrix : the matrix with the log multiplied probabilities (P(x) * log(P(x)))

114

- result_vector : the vector avec summing entropy of each class. Each value corresponds to a cluster.

114

- result_vector : the vector avec summing entropy of each class. Each value corresponds to a cluster.

115

- result : the final entropy measure of the clustering

115

- result : the final entropy measure of the clustering

116

'''

116

'''

117

def divide_line(a, divider):

117

def divide_line(a, divider):

118

'''

118

'''

119

Sub function used for dividing matrix by a vector line by line.

119

Sub function used for dividing matrix by a vector line by line.

120

'''

120

'''

121

return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)

121

return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)

122

123

# Build count matrix

123

# Build count matrix

124

count_matrix = compute_count_matrix(y_hat, y_truth)

124

count_matrix = compute_count_matrix(y_truth, y_hat)

125

126

# Build dividers vector

126

# Build dividers vector

127

dividers = count_matrix.sum(axis=1)

127

dividers = count_matrix.sum(axis=1)

128

129

matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=np.float), dividers)

129

matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=np.float), dividers)

130

131

log_matrix = np.zeros(matrix_divided.shape)

131

log_matrix = np.zeros(matrix_divided.shape)

132

np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)

132

np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)

133

result_matrix = -1 * np.multiply(matrix_divided, log_matrix)

133

result_matrix = -1 * np.multiply(matrix_divided, log_matrix)

134

result_vector = result_matrix.sum(axis=1)

134

result_vector = result_matrix.sum(axis=1)

135

result_vector.sum()

135

result_vector.sum()

136

137

if np.isnan(np.sum(result_vector)):

137

if np.isnan(np.sum(result_vector)):

138

print("COUNT MATRIX")

138

print("COUNT MATRIX")

139

print(count_matrix)

139

print(count_matrix)

140

print("MATRIX DIVIDED")

140

print("MATRIX DIVIDED")

141

print(matrix_divided)

141

print(matrix_divided)

142

print("RESULT MATRIX")

142

print("RESULT MATRIX")

143

print(result_matrix)

143

print(result_matrix)

144

print("VECTOR MATRIX")

144

print("VECTOR MATRIX")

145

print(result_vector)

145

print(result_vector)

146

print("An error occured due to nan value, some values are printed before")

146

print("An error occured due to nan value, some values are printed before")

147

exit(1)

147

exit(1)

148

149

result = result_vector * dividers / dividers.sum()

149

result = result_vector * dividers / dividers.sum()

150

result = result.sum()

150

result = result.sum()

151

return (result_matrix, result_vector, result)

151

return (result_matrix, result_vector, result)

152

153

154

def purity_score(y_truth, y_hat):

154

155

156

def divide_line(a, divider):

157

'''

158

Sub function used for dividing matrix by a vector line by line.

159

'''

160

return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)

161

162

def compute_purity_score(count_matrix, axis=0):

163

count_per_row = count_matrix.sum(axis=axis)

164

dividers = np.square(count_per_row)

165

count_matrix_squared = np.square(count_matrix)

166

matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix_squared, dtype=np.float), dividers)

167

vector_purity = np.sum(matrix_divided, axis=axis)

168

169

scalar_purity = np.average(vector_purity, weights=count_per_row)

170

return (vector_purity, scalar_purity)

171

172

173

count_matrix = compute_count_matrix(y_truth, y_hat)

174

_, purity_cluster_score = compute_purity_score(count_matrix, 1)

175

_, purity_class_score = cluster_purity = compute_purity_score(count_matrix, 0)

176

177

K = np.sqrt(purity_cluster_score * purity_class_score)

178

179

for i in range(count_matrix.shape[0]):

180

181

for j in range(count_matrix.shape[1]):

182

count_matrix[i][j]

183

count_matrix[i]

184

return {

185

"purity_class_score": purity_class_score,

186

"purity_cluster_score": purity_cluster_score,

187

"K": K

188

}

189

190

155

if __name__ == "__main__":

191

if __name__ == "__main__":

156

# Hypothesis

192

# Hypothesis

157

y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])

193

y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])

158

# Truth

194

# Truth

159

y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])

195

y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])

160

196

161

(result_matrix, result_vector, result) = entropy(y, y_hat)

197

(result_matrix, result_vector, result) = entropy_score(y, y_hat)

162

198

199

200

print(purity_score(y, y_hat))

201

exit(1)

163

print("Result matrix: ")

202

print("Result matrix: ")

164

print(result_matrix)

203

print(result_matrix)

165

print("Result vector: ")

204

print("Result vector: ")

166

print(result_vector)

205

print(result_vector)

167

print("Result: ", result)

206

print("Result: ", result)

GITLAB

Quillot Mathias / volia

purity measure added and tested

 '''
 This module is a part of my library.
 It aims to compute some measures for clustering.
 '''
 import numpy as np
 def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
     '''
     Compute the element-wise disequilibrium between two count matrices.
     Each matrix is first normalised (by its grand total or by its row
     sums, see isGlobal), then the second one is subtracted from the first.
     Parameters:
         matrix1, matrix2 : 2-D array-likes of identical shape.
         isGlobal : when True each matrix is divided by the sum of all its
             entries; when False (default) every row is divided by its own
             sum. A zero divider yields zeros instead of a division error.
         mod : optional space-separated list of post-processing steps,
             applied in the given order: "power" (square), "human"
             (multiply by 100) and "abs" (absolute value). None or an
             empty string applies no step.
     Returns:
         (mask, result): mask is True wherever at least one of the two
         input matrices is non-zero; result is the (optionally
         post-processed) difference of the normalised matrices.
     Raises:
         ValueError: if mod contains an unknown step.
     '''
     def normalise(matrix):
         '''
         Divide matrix by its total (isGlobal) or row by row by the row sums.
         '''
         # The builtin float replaces the np.float alias removed from NumPy.
         values = np.asarray(matrix, dtype=float)
         if isGlobal:
             dividers = values.sum()
         else:
             # keepdims gives a column vector so each row is divided by its own sum.
             dividers = values.sum(axis=1, keepdims=True)
         return np.divide(values, dividers, out=np.zeros_like(values), where=dividers != 0)
     values1 = np.asarray(matrix1)
     values2 = np.asarray(matrix2)
     result = normalise(values1) - normalise(values2)
     # A cell is relevant as soon as one of the two matrices is non-zero there.
     mask = np.logical_or(values1 != 0, values2 != 0)
     steps = {
         "power": lambda values: np.power(values, 2),
         "human": lambda values: values * 100,
         "abs": np.absolute,
     }
     # An empty mod means "no post-processing", exactly like None.
     if mod:
         for word in mod.split():
             if word not in steps:
                 raise ValueError("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\"")
             result = steps[word](result)
     return (mask, result)
 def disequilibrium_mean_by_cluster(mask, matrix):
     '''
     Average the disequilibrium of every cluster (row).
     For each row of matrix, the sum of the row is divided by the number
     of True entries in the matching row of mask.
     Returns a 1-D float array holding one mean per row.
     '''
     cluster_count = len(matrix)
     means = np.zeros((cluster_count))
     for row_index, row in enumerate(matrix):
         relevant_cells = mask[row_index].sum()
         means[row_index] = row.sum() / relevant_cells
     return means
 def disequilibrium(matrix1, matrix2, isGlobal=False):
     '''
     Compute the disequilibrium matrix and the overall disequilibrium value.
     Returns a tuple (mask, percent_matrix, score):
         - mask : the mask returned by disequilibrium_
         - percent_matrix : the raw disequilibrium multiplied by 100
         - score : the sum of the per-cluster means of the squared
           disequilibrium, divided by the number of rows of matrix1
     '''
     mask, raw = disequilibrium_(matrix1, matrix2, isGlobal)
     squared = np.power(raw, 2)
     percent_matrix = raw * 100
     per_cluster = disequilibrium_mean_by_cluster(mask, squared)
     score = per_cluster.sum()/matrix1.shape[0]
     return (mask, percent_matrix, score)
-def compute_count_matrix(y_hat, y_truth):
+def compute_count_matrix(y_truth, y_hat):
     '''
         Check the size of the lists with assertion
     '''
     # Check size of the lists
     assert len(y_hat) == len(y_truth), f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}"
     # Build count matrix
     count_matrix = np.zeros((max(y_hat+1), max(y_truth+1)))
     for i in range(len(y_hat)):
         count_matrix[y_hat[i]][y_truth[i]] += 1
     return count_matrix
 def entropy_score(y_truth, y_hat):
     '''
     Compute the entropy of a clustering with respect to the true classes.
     Labels must be integer encoded (e.g. with a label encoder) before
     being given to this function; do not use one hot labels.
     Parameters:
         y_truth : integer class label of every sample.
         y_hat : integer cluster label of every sample.
     Returns a tuple with:
         - result_matrix : the matrix of -P(x) * log2(P(x)) terms, one row
           per cluster and one column per class
         - result_vector : the entropy of each cluster (row sums of
           result_matrix)
         - result : the final entropy measure of the clustering, i.e. the
           cluster entropies averaged with the cluster sizes as weights
     Raises:
         SystemExit: if a NaN appears in the cluster entropies (diagnostic
             matrices are printed first).
     '''
     # Build count matrix
     count_matrix = compute_count_matrix(y_truth, y_hat)
     # Build dividers vector (number of samples in each cluster)
     dividers = count_matrix.sum(axis=1)
     # The builtin float replaces the np.float alias removed from NumPy.
     counts = np.asarray(count_matrix, dtype=float)
     # Column vector so that every row is divided by its own total.
     row_totals = dividers[:, np.newaxis]
     matrix_divided = np.divide(counts, row_totals, out=np.zeros_like(counts), where=row_totals != 0)
     # The log is only evaluated on non-empty cells; the others stay at 0.
     log_matrix = np.zeros(matrix_divided.shape)
     np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)
     result_matrix = -1 * np.multiply(matrix_divided, log_matrix)
     result_vector = result_matrix.sum(axis=1)
     if np.isnan(np.sum(result_vector)):
         print("COUNT MATRIX")
         print(count_matrix)
         print("MATRIX DIVIDED")
         print(matrix_divided)
         print("RESULT MATRIX")
         print(result_matrix)
         print("VECTOR MATRIX")
         print(result_vector)
         print("An error occured due to nan value, some values are printed before")
         # Same effect as exit(1) without relying on the site-provided builtin.
         raise SystemExit(1)
     result = (result_vector * dividers / dividers.sum()).sum()
     return (result_matrix, result_vector, result)
+def purity_score(y_truth, y_hat):
+    def divide_line(a, divider):
+        '''
+        Sub function used for dividing matrix by a vector line by line.
+        '''
+        return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
+    def compute_purity_score(count_matrix, axis=0):
+        count_per_row = count_matrix.sum(axis=axis)
+        dividers = np.square(count_per_row)
+        count_matrix_squared = np.square(count_matrix)
+        matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix_squared, dtype=np.float), dividers)
+        vector_purity = np.sum(matrix_divided, axis=axis)
+        scalar_purity = np.average(vector_purity, weights=count_per_row)
+        return (vector_purity, scalar_purity)
+    count_matrix = compute_count_matrix(y_truth, y_hat)
+    _, purity_cluster_score = compute_purity_score(count_matrix, 1)
+    _, purity_class_score = cluster_purity = compute_purity_score(count_matrix, 0)
+    K = np.sqrt(purity_cluster_score * purity_class_score)
+    for i in range(count_matrix.shape[0]):
+        for j in range(count_matrix.shape[1]):
+            count_matrix[i][j]
+        count_matrix[i]
+    return {
+        "purity_class_score": purity_class_score,
+        "purity_cluster_score": purity_cluster_score,
+        "K": K
+    }
 if __name__ == "__main__":
     # Small demonstration of the entropy and purity measures.
     # Hypothesis: predicted cluster index of every sample
     y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
     # Truth: class index of every sample
     y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])
     (result_matrix, result_vector, result) = entropy_score(y, y_hat)
     print(purity_score(y, y_hat))
     # No early exit here: it hid the entropy results below and made the
     # demo finish with a failure status.
     print("Result matrix: ")
     print(result_matrix)
     print("Result vector: ")
     print(result_vector)
     print("Result: ", result)