Quillot Mathias / volia

1

'''

1

'''

2

This module is a part of my library.

2

This module is a part of my library.

3

It aims to compute some measures for clustering.

3

It aims to compute some measures for clustering.

4

'''

4

'''

5

6

import numpy as np

6

import numpy as np

7

8

def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):

8

def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):

9

'''

9

'''

10

Compute disequilibrium for all the clusters.

10

Compute disequilibrium for all the clusters.

11

The disequilibrium is compute from the difference

11

The disequilibrium is compute from the difference

12

between two clustering sets.

12

between two clustering sets.

13

isGlobal permet à l'utilisateur de choisir le dénominateur de

13

isGlobal permet à l'utilisateur de choisir le dénominateur de

14

la fonction :

14

la fonction :

15

- True : divise la valeur par le nombre d'élément du cluster

15

- True : divise la valeur par le nombre d'élément du cluster

16

- False : divise la valeur par le nombre d'élément total

16

- False : divise la valeur par le nombre d'élément total

17

18

withPower permet à l'utilisateur de décider d'appliquer un carré 2 ou

18

withPower permet à l'utilisateur de décider d'appliquer un carré 2 ou

19

une valeur absolue.

19

une valeur absolue.

20

'''

20

'''

21

22

def divide_line(a, divider):

22

def divide_line(a, divider):

23

'''

23

'''

24

Sub function used for dividing matrix by a vector line by line.

24

Sub function used for dividing matrix by a vector line by line.

25

'''

25

'''

26

return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)

26

return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)

27

28

dividers1 = 0

28

dividers1 = 0

29

dividers2 = 0

29

dividers2 = 0

30

31

if isGlobal:

31

if isGlobal:

32

dividers1 = matrix1.sum()

32

dividers1 = matrix1.sum()

33

dividers2 = matrix2.sum()

33

dividers2 = matrix2.sum()

34

else:

34

else:

35

dividers1 = matrix1.sum(axis=1)

35

dividers1 = matrix1.sum(axis=1)

36

dividers2 = matrix2.sum(axis=1)

36

dividers2 = matrix2.sum(axis=1)

37

38

matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=np.float), dividers1)

38

matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=np.float), dividers1)

39

40

matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=np.float), dividers2)

40

matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=np.float), dividers2)

41

42

diff = matrix1_divided - matrix2_divided

42

diff = matrix1_divided - matrix2_divided

43

44

mask = np.logical_not(np.logical_and(matrix2==0, matrix1==0))

44

mask = np.logical_not(np.logical_and(matrix2==0, matrix1==0))

45

46

result = diff

46

result = diff

47

48

if mod != None or mod == "":

48

if mod != None or mod == "":

49

for word in mod.split(" "):

49

for word in mod.split(" "):

50

if word == "power":

50

if word == "power":

51

result = np.power(result,2)

51

result = np.power(result,2)

52

elif word == "human":

52

elif word == "human":

53

result = result * 100

53

result = result * 100

54

elif word == "abs":

54

elif word == "abs":

55

result = np.absolute(result)

55

result = np.absolute(result)

56

else:

56

else:

57

raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\"")

57

raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\"")

58

return (mask, result)

58

return (mask, result)

59

60

61

62

def disequilibrium_mean_by_cluster(mask, matrix):

62

def disequilibrium_mean_by_cluster(mask, matrix):

63

'''

63

'''

64

Mean of disequilibrium

64

Mean of disequilibrium

65

matrix is the disequilibrium calculated

65

matrix is the disequilibrium calculated

66

from number of occurences belonging to a class,

66

from number of occurences belonging to a class,

67

for each cluster.

67

for each cluster.

68

'''

68

'''

69

nb_k = len(matrix)

69

nb_k = len(matrix)

70

results = np.zeros((nb_k))

70

results = np.zeros((nb_k))

71

72

for i in range(nb_k):

72

for i in range(nb_k):

73

results[i] = matrix[i].sum() / mask[i].sum()

73

results[i] = matrix[i].sum() / mask[i].sum()

74

return results

74

return results

75

76

77

def disequilibrium(matrix1, matrix2, isGlobal=False):

77

def disequilibrium(matrix1, matrix2, isGlobal=False):

78

'''

78

'''

79

Disequilibrium matrix

79

Disequilibrium matrix

80

And Disequilibrium value

80

And Disequilibrium value

81

'''

81

'''

82

mask, result = disequilibrium_(matrix1, matrix2, isGlobal)

82

mask, result = disequilibrium_(matrix1, matrix2, isGlobal)

83

result_human = result * 100

83

result_human = result * 100

84

result_power = np.power(result, 2)

84

result_power = np.power(result, 2)

85

86

return (

86

return (

87

mask,

87

mask,

88

result_human,

88

result_human,

89

disequilibrium_mean_by_cluster(mask, result_power).sum()/matrix1.shape[0]

89

disequilibrium_mean_by_cluster(mask, result_power).sum()/matrix1.shape[0]

90

)

90

)

91

92

93

def compute_count_matrix(y_truth, y_hat):

93

def compute_count_matrix(y_truth, y_hat):

94

'''

94

'''

95

Check the size of the lists with assertion

95

Check the size of the lists with assertion

96

'''

96

'''

97

# Check size of the lists

97

# Check size of the lists

98

assert len(y_hat) == len(y_truth), f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}"

98

assert len(y_hat) == len(y_truth), f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}"

99

100

# Build count matrix

100

# Build count matrix

101

count_matrix = np.zeros((max(y_hat+1), max(y_truth+1)))

101

count_matrix = np.zeros((max(y_hat+1), max(y_truth+1)))

102

for i in range(len(y_hat)):

102

for i in range(len(y_hat)):

103

count_matrix[y_hat[i]][y_truth[i]] += 1

103

count_matrix[y_hat[i]][y_truth[i]] += 1

104

return count_matrix

104

return count_matrix

105

106

107

def entropy_score(y_truth, y_hat):

107

def entropy_score(y_truth, y_hat):

108

'''

108

'''

109

Need to use label encoder before givin y_hat and y_truth

109

Need to use label encoder before givin y_hat and y_truth

110

Don't use one hot labels

110

Don't use one hot labels

111

112

Return a tuple with:

112

Return a tuple with:

113

- result_matrix : the matrix with the log multiplied probabilities (P(x) * log(P(x)))

113

- result_matrix : the matrix with the log multiplied probabilities (P(x) * log(P(x)))

114

- result_vector : the vector avec summing entropy of each class. Each value corresponds to a cluster.

114

- result_vector : the vector avec summing entropy of each class. Each value corresponds to a cluster.

115

- result : the final entropy measure of the clustering

115

- result : the final entropy measure of the clustering

116

'''

116

'''

117

def divide_line(a, divider):

117

def divide_line(a, divider):

118

'''

118

'''

119

Sub function used for dividing matrix by a vector line by line.

119

Sub function used for dividing matrix by a vector line by line.

120

'''

120

'''

121

return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)

121

return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)

122

123

# Build count matrix

123

# Build count matrix

124

count_matrix = compute_count_matrix(y_truth, y_hat)

124

count_matrix = compute_count_matrix(y_truth, y_hat)

125

126

# Build dividers vector

126

# Build dividers vector

127

dividers = count_matrix.sum(axis=1)

127

dividers = count_matrix.sum(axis=1)

128

129

matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=np.float), dividers)

129

matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=np.float), dividers)

130

131

log_matrix = np.zeros(matrix_divided.shape)

131

log_matrix = np.zeros(matrix_divided.shape)

132

np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)

132

np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)

133

result_matrix = -1 * np.multiply(matrix_divided, log_matrix)

133

result_matrix = -1 * np.multiply(matrix_divided, log_matrix)

134

result_vector = result_matrix.sum(axis=1)

134

result_vector = result_matrix.sum(axis=1)

135

result_vector.sum()

135

result_vector.sum()

136

137

if np.isnan(np.sum(result_vector)):

137

if np.isnan(np.sum(result_vector)):

138

print("COUNT MATRIX")

138

print("COUNT MATRIX")

139

print(count_matrix)

139

print(count_matrix)

140

print("MATRIX DIVIDED")

140

print("MATRIX DIVIDED")

141

print(matrix_divided)

141

print(matrix_divided)

142

print("RESULT MATRIX")

142

print("RESULT MATRIX")

143

print(result_matrix)

143

print(result_matrix)

144

print("VECTOR MATRIX")

144

print("VECTOR MATRIX")

145

print(result_vector)

145

print(result_vector)

146

print("An error occured due to nan value, some values are printed before")

146

print("An error occured due to nan value, some values are printed before")

147

exit(1)

147

exit(1)

148

149

result = result_vector * dividers / dividers.sum()

149

result = result_vector * dividers / dividers.sum()

150

result = result.sum()

150

result = result.sum()

151

return (result_matrix, result_vector, result)

151

return (result_matrix, result_vector, result)

152

153

154

def purity_score(y_truth, y_hat):

154

def purity_score(y_truth, y_hat):

155

'''

156

Return three values in a dictionary:

157

- purity_class_score: the purity score of the class (asp)

158

- purity_cluster_score: the purity score of the cluster (acp)

159

- K: the overall evaluation criterion (sqrt(asp * acp))

160

161

This function is based on the following article:

162

Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan

163

'''

155

164

156

def divide_line(a, divider):

165

def divide_line(a, divider):

157

'''

166

'''

158

Sub function used for dividing matrix by a vector line by line.

167

Sub function used for dividing matrix by a vector line by line.

159

'''

168

'''

160

return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)

169

return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)

161

170

162

def compute_purity_score(count_matrix, axis=0):

171

def compute_purity_score(count_matrix, axis=0):

163

count_per_row = count_matrix.sum(axis=axis)

172

count_per_row = count_matrix.sum(axis=axis)

164

dividers = np.square(count_per_row)

173

dividers = np.square(count_per_row)

165

count_matrix_squared = np.square(count_matrix)

174

count_matrix_squared = np.square(count_matrix)

166

matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix_squared, dtype=np.float), dividers)

175

matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix_squared, dtype=np.float), dividers)

167

vector_purity = np.sum(matrix_divided, axis=axis)

176

vector_purity = np.sum(matrix_divided, axis=axis)

168

177

169

scalar_purity = np.average(vector_purity, weights=count_per_row)

178

scalar_purity = np.average(vector_purity, weights=count_per_row)

170

return (vector_purity, scalar_purity)

179

return (vector_purity, scalar_purity)

171

180

172

181

173

count_matrix = compute_count_matrix(y_truth, y_hat)

182

count_matrix = compute_count_matrix(y_truth, y_hat)

174

_, purity_cluster_score = compute_purity_score(count_matrix, 1)

183

_, purity_cluster_score = compute_purity_score(count_matrix, 1)

175

_, purity_class_score = cluster_purity = compute_purity_score(count_matrix, 0)

184

_, purity_class_score = cluster_purity = compute_purity_score(count_matrix, 0)

176

185

177

K = np.sqrt(purity_cluster_score * purity_class_score)

186

K = np.sqrt(purity_cluster_score * purity_class_score)

178

187

179

for i in range(count_matrix.shape[0]):

188

for i in range(count_matrix.shape[0]):

180

189

181

for j in range(count_matrix.shape[1]):

190

for j in range(count_matrix.shape[1]):

182

count_matrix[i][j]

191

count_matrix[i][j]

183

count_matrix[i]

192

count_matrix[i]

184

return {

193

return {

185

"purity_class_score": purity_class_score,

194

"purity_class_score": purity_class_score,

186

"purity_cluster_score": purity_cluster_score,

195

"purity_cluster_score": purity_cluster_score,

187

"K": K

196

"K": K

188

}

197

}

189

198

190

199

191

if __name__ == "__main__":

200

if __name__ == "__main__":

192

# Hypothesis

201

# Hypothesis

193

y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])

202

y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])

194

# Truth

203

# Truth

195

y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])

204

y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])

196

205

197

(result_matrix, result_vector, result) = entropy_score(y, y_hat)

206

(result_matrix, result_vector, result) = entropy_score(y, y_hat)

198

207

199

208

200

print(purity_score(y, y_hat))

209

print(purity_score(y, y_hat))

201

exit(1)

210

exit(1)

202

print("Result matrix: ")

211

print("Result matrix: ")

203

print(result_matrix)

212

print(result_matrix)

204

print("Result vector: ")

213

print("Result vector: ")

205

print(result_vector)

214

print(result_vector)

206

print("Result: ", result)

215

print("Result: ", result)

GITLAB

Quillot Mathias / volia

Add comments to the purity_score function

 '''
 This module is a part of my library.
 It aims to compute some measures for clustering.
 '''
 import numpy as np
 def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
     '''
     Compute the element-wise disequilibrium between two count matrices.

     Both matrices are normalised, then the normalised matrix2 is
     subtracted from the normalised matrix1.

     Parameters:
         - matrix1, matrix2 : count matrices of the same shape (anything
           np.asarray accepts). They must be 2-D when isGlobal is False.
         - isGlobal : selects the denominator of the normalisation:
             - True : each value is divided by the grand total of its matrix
             - False : each value is divided by the sum of its own row
           A zero denominator yields 0 instead of a division error.
         - mod : optional whitespace-separated keywords applied in order to
           the difference: "power" (square), "human" (multiply by 100),
           "abs" (absolute value). None or an empty string leaves the
           difference untouched.

     Return a tuple with:
         - mask : boolean matrix, False only where both inputs are zero
         - result : the (optionally transformed) difference matrix

     Raise ValueError when mod contains an unknown keyword.
     '''
     def normalise(counts, divider):
         '''
         Divide counts by divider, leaving 0 wherever the divider is 0.
         '''
         return np.divide(counts, divider, out=np.zeros_like(counts), where=divider != 0)

     matrix1 = np.asarray(matrix1)
     matrix2 = np.asarray(matrix2)
     # The builtin float replaces the np.float alias removed from NumPy.
     counts1 = matrix1.astype(float)
     counts2 = matrix2.astype(float)

     if isGlobal:
         dividers1 = counts1.sum()
         dividers2 = counts2.sum()
     else:
         # keepdims lets each row total broadcast over the cells of its row.
         dividers1 = counts1.sum(axis=1, keepdims=True)
         dividers2 = counts2.sum(axis=1, keepdims=True)

     result = normalise(counts1, dividers1) - normalise(counts2, dividers2)

     # A cell is relevant as soon as one of the two matrices is non-zero.
     mask = np.logical_not(np.logical_and(matrix2 == 0, matrix1 == 0))

     # None and "" both mean "no transformation".
     if mod:
         for word in mod.split():
             if word == "power":
                 result = np.power(result, 2)
             elif word == "human":
                 result = result * 100
             elif word == "abs":
                 result = np.absolute(result)
             else:
                 raise ValueError("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\")")
     return (mask, result)
 def disequilibrium_mean_by_cluster(mask, matrix):
     '''
     Average the disequilibrium of every cluster.

     For each row of matrix, the sum of the row is divided by the sum of
     the matching row of mask. No guard is applied when a mask row sums
     to zero.

     Return a 1-D float array holding one value per row of matrix.
     '''
     cluster_means = np.zeros(len(matrix))
     for index, row in enumerate(matrix):
         cluster_means[index] = row.sum() / mask[index].sum()
     return cluster_means
 def disequilibrium(matrix1, matrix2, isGlobal=False):
     '''
     Compute the disequilibrium matrix and the overall disequilibrium value.

     Return a tuple with:
         - the mask produced by disequilibrium_
         - the raw disequilibrium matrix multiplied by 100
         - the sum of the per-cluster means of the squared disequilibrium,
           divided by the number of rows of matrix1
     '''
     mask, raw = disequilibrium_(matrix1, matrix2, isGlobal)
     squared_means = disequilibrium_mean_by_cluster(mask, np.power(raw, 2))
     overall = squared_means.sum() / matrix1.shape[0]
     return (mask, raw * 100, overall)
 def compute_count_matrix(y_truth, y_hat):
     '''
     Build the cluster/class count matrix.

     Parameters:
         - y_truth : 1-D sequence of non-negative integer class labels
         - y_hat : 1-D sequence of non-negative integer cluster labels,
           same length as y_truth

     Return a float matrix of shape (max(y_hat) + 1, max(y_truth) + 1)
     whose cell [k][c] is the number of samples assigned to cluster k
     whose true class is c. Empty inputs give an empty (0, 0) matrix.

     Raise AssertionError when the two sequences differ in length.
     '''
     # Check size of the lists
     assert len(y_hat) == len(y_truth), f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}"

     # Plain Python sequences are accepted as well as numpy arrays.
     clusters = np.asarray(y_hat)
     classes = np.asarray(y_truth)

     # Nothing to count: max() of an empty array would raise.
     if clusters.size == 0:
         return np.zeros((0, 0))

     # Build count matrix
     count_matrix = np.zeros((int(clusters.max()) + 1, int(classes.max()) + 1))
     # add.at accumulates repeated (cluster, class) pairs, unlike plain
     # fancy-index assignment.
     np.add.at(count_matrix, (clusters, classes), 1)
     return count_matrix
 def entropy_score(y_truth, y_hat):
     '''
     Compute the entropy of a clustering with respect to the true classes.

     Labels must be integer encoded (use a label encoder beforehand),
     not one-hot vectors.

     Return a tuple with:
         - result_matrix : for each (cluster, class) cell, -P * log2(P)
           where P is the cell count divided by the size of its cluster
         - result_vector : the entropy of each cluster (row sums of
           result_matrix)
         - result : the final entropy measure, i.e. the cluster entropies
           averaged with the cluster sizes as weights

     Raise ValueError when a NaN appears in the cluster entropies.
     '''
     # Build count matrix
     count_matrix = compute_count_matrix(y_truth, y_hat)

     # Build dividers vector (number of samples in each cluster)
     dividers = count_matrix.sum(axis=1)

     # The builtin float replaces the np.float alias removed from NumPy.
     counts = np.asarray(count_matrix, dtype=float)
     row_totals = dividers[:, np.newaxis]
     # Rows with no sample keep a probability of 0 instead of dividing by 0.
     matrix_divided = np.divide(counts, row_totals, out=np.zeros_like(counts), where=row_totals != 0)

     # log2 is only evaluated on non-empty cells; the others stay at 0 so
     # that 0 * log2(0) contributes nothing.
     log_matrix = np.zeros(matrix_divided.shape)
     np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)
     result_matrix = -1 * np.multiply(matrix_divided, log_matrix)
     result_vector = result_matrix.sum(axis=1)

     if np.isnan(np.sum(result_vector)):
         # Report the intermediate values to the caller rather than
         # printing them and killing the interpreter.
         raise ValueError(
             "An error occured due to nan value\n"
             f"COUNT MATRIX\n{count_matrix}\n"
             f"MATRIX DIVIDED\n{matrix_divided}\n"
             f"RESULT MATRIX\n{result_matrix}\n"
             f"VECTOR MATRIX\n{result_vector}"
         )

     result = (result_vector * dividers / dividers.sum()).sum()
     return (result_matrix, result_vector, result)
 def purity_score(y_truth, y_hat):
     '''
     Return three values in a dictionary:
         - purity_class_score: the purity score of the class (asp)
         - purity_cluster_score: the purity score of the cluster (acp)
         - K: the overall evaluation criterion (sqrt(asp * acp))

     y_truth and y_hat are integer encoded labels of the same length.

     This function is based on the following article:
     Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan
     '''
     def compute_purity_score(count_matrix, axis=0):
         '''
         Compute the purity along one axis of the count matrix.

         axis=1 sums over each row (one purity per cluster), axis=0 sums
         over each column (one purity per class).

         Return a tuple with the purity vector and its average weighted
         by the row/column totals.
         '''
         # The builtin float replaces the np.float alias removed from NumPy.
         counts = np.asarray(count_matrix, dtype=float)
         # keepdims keeps the totals aligned with the summed axis, so each
         # cell is divided by the squared total of its own row (axis=1)
         # or of its own column (axis=0).
         totals = counts.sum(axis=axis, keepdims=True)
         dividers = np.square(totals)
         ratios = np.divide(np.square(counts), dividers, out=np.zeros_like(counts), where=dividers != 0)
         vector_purity = ratios.sum(axis=axis)
         scalar_purity = np.average(vector_purity, weights=totals.ravel())
         return (vector_purity, scalar_purity)

     count_matrix = compute_count_matrix(y_truth, y_hat)
     _, purity_cluster_score = compute_purity_score(count_matrix, 1)
     _, purity_class_score = compute_purity_score(count_matrix, 0)

     K = np.sqrt(purity_cluster_score * purity_class_score)

     return {
         "purity_class_score": purity_class_score,
         "purity_cluster_score": purity_cluster_score,
         "K": K
     }
 if __name__ == "__main__":
     # Small demonstration of the entropy and purity measures on a toy
     # clustering of twelve samples spread over four classes.

     # Hypothesis
     y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
     # Truth
     y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])

     (result_matrix, result_vector, result) = entropy_score(y, y_hat)

     print(purity_score(y, y_hat))
     # The stray exit(1) that used to follow made the entropy report below
     # unreachable and ended a successful run with a failure status.
     print("Result matrix: ")
     print(result_matrix)
     print("Result vector: ")
     print(result_vector)
     print("Result: ", result)