Quillot Mathias / volia

1

'''

1

'''

2

This module is a part of my library.

2

This module is a part of my library.

3

It aims to compute some measures for clustering.

3

It aims to compute some measures for clustering.

4

'''

4

'''

5

6

import numpy as np

6

import numpy as np

7

8

def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):

8

def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):

9

'''

9

'''

10

Compute disequilibrium for all the clusters.

10

Compute disequilibrium for all the clusters.

11

The disequilibrium is compute from the difference

11

The disequilibrium is compute from the difference

12

between two clustering sets.

12

between two clustering sets.

13

isGlobal permet à l'utilisateur de choisir le dénominateur de

13

isGlobal permet à l'utilisateur de choisir le dénominateur de

14

la fonction :

14

la fonction :

15

- True : divise la valeur par le nombre d'élément du cluster

15

- True : divise la valeur par le nombre d'élément du cluster

16

- False : divise la valeur par le nombre d'élément total

16

- False : divise la valeur par le nombre d'élément total

17

18

withPower permet à l'utilisateur de décider d'appliquer un carré 2 ou

18

withPower permet à l'utilisateur de décider d'appliquer un carré 2 ou

19

une valeur absolue.

19

une valeur absolue.

20

'''

20

'''

21

22

def divide_line(a, divider):

22

def divide_line(a, divider):

23

'''

23

'''

24

Sub function used for dividing matrix by a vector line by line.

24

Sub function used for dividing matrix by a vector line by line.

25

'''

25

'''

26

return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)

26

return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)

27

28

dividers1 = 0

28

dividers1 = 0

29

dividers2 = 0

29

dividers2 = 0

30

31

if isGlobal:

31

if isGlobal:

32

dividers1 = matrix1.sum()

32

dividers1 = matrix1.sum()

33

dividers2 = matrix2.sum()

33

dividers2 = matrix2.sum()

34

else:

34

else:

35

dividers1 = matrix1.sum(axis=1)

35

dividers1 = matrix1.sum(axis=1)

36

dividers2 = matrix2.sum(axis=1)

36

dividers2 = matrix2.sum(axis=1)

37

38

matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=np.float), dividers1)

38

matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=np.float), dividers1)

39

40

matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=np.float), dividers2)

40

matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=np.float), dividers2)

41

42

diff = matrix1_divided - matrix2_divided

42

diff = matrix1_divided - matrix2_divided

43

44

mask = np.logical_not(np.logical_and(matrix2==0, matrix1==0))

44

mask = np.logical_not(np.logical_and(matrix2==0, matrix1==0))

45

46

result = diff

46

result = diff

47

48

if mod != None or mod == "":

48

if mod != None or mod == "":

49

for word in mod.split(" "):

49

for word in mod.split(" "):

50

if word == "power":

50

if word == "power":

51

result = np.power(result,2)

51

result = np.power(result,2)

52

elif word == "human":

52

elif word == "human":

53

result = result * 100

53

result = result * 100

54

elif word == "abs":

54

elif word == "abs":

55

result = np.absolute(result)

55

result = np.absolute(result)

56

else:

56

else:

57

raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\"")

57

raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\"")

58

return (mask, result)

58

return (mask, result)

59

60

61

62

def disequilibrium_mean_by_cluster(mask, matrix):

62

def disequilibrium_mean_by_cluster(mask, matrix):

63

'''

63

'''

64

Mean of disequilibrium

64

Mean of disequilibrium

65

matrix is the disequilibrium calculated

65

matrix is the disequilibrium calculated

66

from number of occurences belonging to a class,

66

from number of occurences belonging to a class,

67

for each cluster.

67

for each cluster.

68

'''

68

'''

69

nb_k = len(matrix)

69

nb_k = len(matrix)

70

results = np.zeros((nb_k))

70

results = np.zeros((nb_k))

71

72

for i in range(nb_k):

72

for i in range(nb_k):

73

results[i] = matrix[i].sum() / mask[i].sum()

73

results[i] = matrix[i].sum() / mask[i].sum()

74

return results

74

return results

75

76

77

def disequilibrium(matrix1, matrix2, isGlobal=False):

77

def disequilibrium(matrix1, matrix2, isGlobal=False):

78

'''

78

'''

79

Disequilibrium matrix

79

Disequilibrium matrix

80

And Disequilibrium value

80

And Disequilibrium value

81

'''

81

'''

82

mask, result = disequilibrium_(matrix1, matrix2, isGlobal)

82

mask, result = disequilibrium_(matrix1, matrix2, isGlobal)

83

result_human = result * 100

83

result_human = result * 100

84

result_power = np.power(result, 2)

84

result_power = np.power(result, 2)

85

86

return (

86

return (

87

mask,

87

mask,

88

result_human,

88

result_human,

89

disequilibrium_mean_by_cluster(mask, result_power).sum()/matrix1.shape[0]

89

disequilibrium_mean_by_cluster(mask, result_power).sum()/matrix1.shape[0]

90

)

90

)

91

92

93

def compute_count_matrix(y_truth, y_hat):

93

def compute_count_matrix(y_truth, y_hat):

94

'''

94

'''

95

Check the size of the lists with assertion

95

Check the size of the lists with assertion

96

'''

96

'''

97

# Check size of the lists

97

# Check size of the lists

98

assert len(y_hat) == len(y_truth), f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}"

98

assert len(y_hat) == len(y_truth), f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}"

99

100

# Build count matrix

100

# Build count matrix

101

count_matrix = np.zeros((max(y_hat+1), max(y_truth+1)))

101

count_matrix = np.zeros((max(y_hat+1), max(y_truth+1)))

102

for i in range(len(y_hat)):

102

for i in range(len(y_hat)):

103

count_matrix[y_hat[i]][y_truth[i]] += 1

103

count_matrix[y_hat[i]][y_truth[i]] += 1

104

return count_matrix

104

return count_matrix

105

106

107

def entropy_score(y_truth, y_hat):

107

def entropy_score(y_truth, y_hat):

108

'''

108

'''

109

Need to use label encoder before givin y_hat and y_truth

109

Need to use label encoder before givin y_hat and y_truth

110

Don't use one hot labels

110

Don't use one hot labels

111

112

Return a tuple with:

112

Return a tuple with:

113

- result_matrix : the matrix with the log multiplied probabilities (P(x) * log(P(x)))

113

- result_matrix : the matrix with the log multiplied probabilities (P(x) * log(P(x)))

114

- result_vector : the vector avec summing entropy of each class. Each value corresponds to a cluster.

114

- result_vector : the vector avec summing entropy of each class. Each value corresponds to a cluster.

115

- result : the final entropy measure of the clustering

115

- result : the final entropy measure of the clustering

116

'''

116

'''

117

def divide_line(a, divider):

117

def divide_line(a, divider):

118

'''

118

'''

119

Sub function used for dividing matrix by a vector line by line.

119

Sub function used for dividing matrix by a vector line by line.

120

'''

120

'''

121

return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)

121

return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)

122

123

# Build count matrix

123

# Build count matrix

124

count_matrix = compute_count_matrix(y_truth, y_hat)

124

count_matrix = compute_count_matrix(y_truth, y_hat)

125

126

# Build dividers vector

126

# Build dividers vector

127

dividers = count_matrix.sum(axis=1)

127

dividers = count_matrix.sum(axis=1)

128

129

matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=np.float), dividers)

129

matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=np.float), dividers)

130

131

log_matrix = np.zeros(matrix_divided.shape)

131

log_matrix = np.zeros(matrix_divided.shape)

132

np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)

132

np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)

133

result_matrix = -1 * np.multiply(matrix_divided, log_matrix)

133

result_matrix = -1 * np.multiply(matrix_divided, log_matrix)

134

result_vector = result_matrix.sum(axis=1)

134

result_vector = result_matrix.sum(axis=1)

135

result_vector.sum()

135

result_vector.sum()

136

137

if np.isnan(np.sum(result_vector)):

137

if np.isnan(np.sum(result_vector)):

138

print("COUNT MATRIX")

138

print("COUNT MATRIX")

139

print(count_matrix)

139

print(count_matrix)

140

print("MATRIX DIVIDED")

140

print("MATRIX DIVIDED")

141

print(matrix_divided)

141

print(matrix_divided)

142

print("RESULT MATRIX")

142

print("RESULT MATRIX")

143

print(result_matrix)

143

print(result_matrix)

144

print("VECTOR MATRIX")

144

print("VECTOR MATRIX")

145

print(result_vector)

145

print(result_vector)

146

print("An error occured due to nan value, some values are printed before")

146

print("An error occured due to nan value, some values are printed before")

147

exit(1)

147

exit(1)

148

149

result = result_vector * dividers / dividers.sum()

149

result = result_vector * dividers / dividers.sum()

150

result = result.sum()

150

result = result.sum()

151

return (result_matrix, result_vector, result)

151

return (result_matrix, result_vector, result)

152

153

154

def purity_score(y_truth, y_hat):

154

def purity_score(y_truth, y_hat):

155

'''

155

'''

156

Return three values in a dictionary:

156

Return three values in a dictionary:

157

- purity_class_score: the purity score of the class (asp)

157

- purity_class_score: the purity score of the class (asp)

158

- purity_cluster_score: the purity score of the cluster (acp)

158

- purity_cluster_score: the purity score of the cluster (acp)

159

- K: the overall evaluation criterion (sqrt(asp * acp))

159

- K: the overall evaluation criterion (sqrt(asp * acp))

160

161

This function is based on the following article:

161

This function is based on the following article:

162

Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan

162

Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan

163

'''

163

'''

164

165

def divide_line(a, divider):

165

def divide_line(a, divider):

166

'''

166

'''

167

Sub function used for dividing matrix by a vector line by line.

167

Sub function used for dividing matrix by a vector line by line.

168

'''

168

'''

169

return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)

169

return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)

170

171

def compute_purity_score(count_matrix, axis=0):

171

def compute_purity_score(count_matrix, axis=0):

172

if axis==0:

173

other_axis = 1

174

else:

175

other_axis = 0

172

count_per_row = count_matrix.sum(axis=axis)

176

count_per_row = count_matrix.sum(axis=axis)

173

dividers = np.square(count_per_row)

177

dividers = np.square(count_per_row)

178

174

count_matrix_squared = np.square(count_matrix)

179

count_matrix_squared = np.square(count_matrix)

175

matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix_squared, dtype=np.float), dividers)

180

matrix_divided = np.apply_along_axis(divide_line, other_axis, np.asarray(count_matrix_squared, dtype=np.float), dividers)

176

vector_purity = np.sum(matrix_divided, axis=axis)

181

vector_purity = np.sum(matrix_divided, axis=axis)

177

182

178

scalar_purity = np.average(vector_purity, weights=count_per_row)

183

scalar_purity = np.average(vector_purity, weights=count_per_row)

179

return (vector_purity, scalar_purity)

184

return (vector_purity, scalar_purity)

180

185

181

186

182

count_matrix = compute_count_matrix(y_truth, y_hat)

187

count_matrix = compute_count_matrix(y_truth, y_hat)

183

_, purity_cluster_score = compute_purity_score(count_matrix, 1)

188

_, purity_cluster_score = compute_purity_score(count_matrix, 1)

184

_, purity_class_score = cluster_purity = compute_purity_score(count_matrix, 0)

189

_, purity_class_score = cluster_purity = compute_purity_score(count_matrix, 0)

185

190

186

K = np.sqrt(purity_cluster_score * purity_class_score)

191

K = np.sqrt(purity_cluster_score * purity_class_score)

187

192

188

for i in range(count_matrix.shape[0]):

193

for i in range(count_matrix.shape[0]):

189

190

for j in range(count_matrix.shape[1]):

194

for j in range(count_matrix.shape[1]):

191

count_matrix[i][j]

195

count_matrix[i][j]

192

count_matrix[i]

196

count_matrix[i]

193

return {

197

return {

194

"purity_class_score": purity_class_score,

198

"purity_class_score": purity_class_score,

195

"purity_cluster_score": purity_cluster_score,

199

"purity_cluster_score": purity_cluster_score,

196

"K": K

200

"K": K

197

}

201

}

198

202

199

203

200

if __name__ == "__main__":

204

if __name__ == "__main__":

205

print("Purity test #1")

201

# Hypothesis

206

# Hypothesis

202

y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])

207

y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])

203

# Truth

208

# Truth

204

y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])

209

y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])

205

210

206

(result_matrix, result_vector, result) = entropy_score(y, y_hat)

211

(result_matrix, result_vector, result) = entropy_score(y, y_hat)

212

print(purity_score(y, y_hat))

207

213

214

exit(1)

215

print("Purity test #2")

216

# Hypothesis

217

y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0, 4, 4, 4])

218

# Truth

219

y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 0, 3, 3, 3])

208

220

209

print(purity_score(y, y_hat))

221

(result_matrix, result_vector, result) = entropy_score(y, y_hat)

210

exit(1)

222

exit(1)

211

print("Result matrix: ")

223

print("Result matrix: ")

212

print(result_matrix)

224

print(result_matrix)

213

print("Result vector: ")

225

print("Result vector: ")

214

print(result_vector)

226

print(result_vector)

GITLAB

Quillot Mathias / volia

Repair error about the definition of the axis for the multiplication

 '''
 This module is a part of my library.
 It aims to compute some measures for clustering.
 '''
 import numpy as np
 def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
     '''
     Compute the element-wise disequilibrium between two count matrices.
     Each matrix is first normalised, then the normalised second matrix is
     subtracted from the normalised first one.
     Parameters:
         - matrix1, matrix2 : 2-D numpy arrays of counts with the same shape
           (presumably one row per cluster -- to be confirmed against callers)
         - isGlobal : selects the denominator of the normalisation:
             - True : every value is divided by the sum of the whole matrix
             - False : every value is divided by the sum of its own row
           A denominator equal to 0 yields 0 instead of a division error.
         - mod : optional string of space separated post-processing steps,
           applied from left to right: "power" (square), "human" (multiply
           by 100) and "abs" (absolute value). None or "" leaves the
           difference untouched.
     Return a tuple with:
         - mask : boolean matrix, False where both inputs are 0
         - result : the (optionally post-processed) difference matrix
     Raise ValueError when mod contains an unknown step.
     '''
     def divide_line(a, divider):
         '''
         Divide a by divider element-wise, leaving 0 wherever divider is 0.
         '''
         return np.divide(a, divider, out=np.zeros_like(a), where=divider != 0)
     if isGlobal:
         dividers1 = matrix1.sum()
         dividers2 = matrix2.sum()
     else:
         dividers1 = matrix1.sum(axis=1)
         dividers2 = matrix2.sum(axis=1)
     # The builtin float replaces the np.float alias, which was removed in
     # NumPy 1.24. Each column is divided by the dividers so that, in the
     # non global case, row i is divided by the sum of row i.
     matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=float), dividers1)
     matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=float), dividers2)
     mask = np.logical_not(np.logical_and(matrix2 == 0, matrix1 == 0))
     result = matrix1_divided - matrix2_divided
     # Both None and the empty string mean "no post-processing".
     if mod:
         for word in mod.split(" "):
             if word == "power":
                 result = np.power(result, 2)
             elif word == "human":
                 result = result * 100
             elif word == "abs":
                 result = np.absolute(result)
             else:
                 # ValueError is a subclass of Exception, so callers catching
                 # the previously raised bare Exception still work.
                 raise ValueError("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\")")
     return (mask, result)
 def disequilibrium_mean_by_cluster(mask, matrix):
     '''
     Average the disequilibrium of every row over its unmasked cells.
     For each row of matrix, the sum of the row is divided by the number
     of True cells found in the same row of mask.
     Return a 1-D float array holding one mean per row of matrix.
     '''
     cluster_count = len(matrix)
     means = np.zeros(cluster_count)
     for row_index, row in enumerate(matrix):
         means[row_index] = row.sum() / mask[row_index].sum()
     return means
 def disequilibrium(matrix1, matrix2, isGlobal=False):
     '''
     Compute the disequilibrium matrix and a scalar disequilibrium value.
     Return a tuple with:
         - mask : boolean matrix as returned by disequilibrium_
         - the raw disequilibrium matrix multiplied by 100
         - the sum of the per-row means of the squared disequilibrium,
           divided by the number of rows of matrix1
     '''
     mask, raw = disequilibrium_(matrix1, matrix2, isGlobal)
     squared = np.power(raw, 2)
     per_row_mean = disequilibrium_mean_by_cluster(mask, squared)
     overall = per_row_mean.sum() / matrix1.shape[0]
     return (mask, raw * 100, overall)
 def compute_count_matrix(y_truth, y_hat):
     '''
     Count how often each (predicted label, true label) pair occurs.
     Parameters:
         - y_truth : sequence or array of integer encoded true labels
         - y_hat : sequence or array of integer encoded predicted labels,
           with the same length as y_truth
     Return a float matrix of shape (max(y_hat) + 1, max(y_truth) + 1)
     where the cell [h][t] is the number of samples with y_hat == h and
     y_truth == t.
     The size of the two inputs is checked with an assertion.
     '''
     # Check size of the lists
     assert len(y_hat) == len(y_truth), f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}"
     # Converting first lets plain Python lists be used: `list + 1` raised
     # a TypeError before.
     y_hat = np.asarray(y_hat)
     y_truth = np.asarray(y_truth)
     # Build count matrix
     count_matrix = np.zeros((y_hat.max() + 1, y_truth.max() + 1))
     # Unlike a fancy-indexed `+=`, np.add.at accumulates repeated pairs.
     np.add.at(count_matrix, (y_hat, y_truth), 1)
     return count_matrix
 def entropy_score(y_truth, y_hat):
     '''
     Compute the entropy of a clustering with respect to the true classes.
     Need to use label encoder before giving y_hat and y_truth
     Don't use one hot labels
     Return a tuple with:
         - result_matrix : the matrix of -P(x) * log2(P(x)), where P(x) is a
           count divided by the total of its row (rows are indexed by y_hat)
         - result_vector : the vector summing the entropy of each class. Each value corresponds to a cluster.
         - result : the final entropy measure of the clustering, i.e. the
           mean of result_vector weighted by the row totals
     Print diagnostics and exit the interpreter with status 1 if a NaN
     shows up in result_vector.
     '''
     def divide_line(a, divider):
         '''
         Divide a by divider element-wise, leaving 0 wherever divider is 0.
         '''
         return np.divide(a, divider, out=np.zeros_like(a), where=divider != 0)
     # Build count matrix
     count_matrix = compute_count_matrix(y_truth, y_hat)
     # Build dividers vector (one total per row)
     dividers = count_matrix.sum(axis=1)
     # The builtin float replaces the np.float alias, which was removed in
     # NumPy 1.24.
     matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=float), dividers)
     # Cells without any count keep a log of 0 so that they contribute 0.
     log_matrix = np.zeros(matrix_divided.shape)
     np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)
     result_matrix = -1 * np.multiply(matrix_divided, log_matrix)
     result_vector = result_matrix.sum(axis=1)
     if np.isnan(np.sum(result_vector)):
         print("COUNT MATRIX")
         print(count_matrix)
         print("MATRIX DIVIDED")
         print(matrix_divided)
         print("RESULT MATRIX")
         print(result_matrix)
         print("VECTOR MATRIX")
         print(result_vector)
         print("An error occured due to nan value, some values are printed before")
         # Same effect as exit(1), without relying on the `exit` helper that
         # only exists when the site module is loaded.
         raise SystemExit(1)
     result = result_vector * dividers / dividers.sum()
     result = result.sum()
     return (result_matrix, result_vector, result)
 def purity_score(y_truth, y_hat):
     '''
     Compute the class purity, the cluster purity and their geometric mean.
     Return three values in a dictionary:
         - purity_class_score: the purity score of the class (asp)
         - purity_cluster_score: the purity score of the cluster (acp)
         - K: the overall evaluation criterion (sqrt(asp * acp))
     This function is based on the following article:
     Unknown-multiple speaker clustering using HMM, J. Ajmera, H. Bourlard, I. Lapidot, I. McCowan
     '''
     def divide_line(a, divider):
         '''
         Divide a by divider element-wise, leaving 0 wherever divider is 0.
         '''
         return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
     # Returns (vector_purity, scalar_purity): the squared counts divided by
     # the squared totals taken along `axis`, summed along `axis`, then
     # averaged with those totals as weights.
     def compute_purity_score(count_matrix, axis=0):
+        if axis==0:
+            other_axis = 1
+        else:
+            other_axis = 0
         count_per_row = count_matrix.sum(axis=axis)
         dividers = np.square(count_per_row)
         # NOTE(review): the np.float alias used below was removed in
         # NumPy 1.24; the builtin float is the documented replacement.
         count_matrix_squared = np.square(count_matrix)
-        matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix_squared, dtype=np.float), dividers)
+        matrix_divided = np.apply_along_axis(divide_line, other_axis, np.asarray(count_matrix_squared, dtype=np.float), dividers)
         vector_purity = np.sum(matrix_divided, axis=axis)
         scalar_purity = np.average(vector_purity, weights=count_per_row)
         return (vector_purity, scalar_purity)
     # Rows of the count matrix are indexed by y_hat, columns by y_truth.
     count_matrix = compute_count_matrix(y_truth, y_hat)
     # axis=1 totals each row, axis=0 totals each column.
     _, purity_cluster_score = compute_purity_score(count_matrix, 1)
     _, purity_class_score = cluster_purity = compute_purity_score(count_matrix, 0)
     K = np.sqrt(purity_cluster_score * purity_class_score)
     # NOTE(review): the loops below only read elements and discard them;
     # they have no visible effect on the returned values.
     for i in range(count_matrix.shape[0]):
         for j in range(count_matrix.shape[1]):
             count_matrix[i][j]
         count_matrix[i]
     return {
         "purity_class_score": purity_class_score,
         "purity_cluster_score": purity_cluster_score,
         "K": K
     }
 # Manual smoke test, run only when the module is executed as a script.
 if __name__ == "__main__":
+    print("Purity test #1")
     # Hypothesis: cluster label given to each sample
     y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
     # Truth: class label of each sample
     y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])
     (result_matrix, result_vector, result) = entropy_score(y, y_hat)
+    print(purity_score(y, y_hat))
+    exit(1)
+    print("Purity test #2")
+    # Hypothesis
+    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0, 4, 4, 4])
+    # Truth
+    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 0, 3, 3, 3])
-    print(purity_score(y, y_hat))
+    (result_matrix, result_vector, result) = entropy_score(y, y_hat)
     # NOTE(review): the script exits here with status 1, so the prints
     # below are never reached.
     exit(1)
     print("Result matrix: ")
     print(result_matrix)
     print("Result vector: ")
     print(result_vector)