purity measure added and tested

Mathias
1 parent 4ed3ebc7d7
Showing 1 changed file with 42 additions and 3 deletions Side-by-side Diff
volia/measures.py
@@ -90,7 +90,7 @@
     )
  
  
-def compute_count_matrix(y_hat, y_truth):
+def compute_count_matrix(y_truth, y_hat):
     '''
         Check the size of the lists with assertion
     '''
@@ -121,7 +121,7 @@
         return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
  
     # Build count matrix
-    count_matrix = compute_count_matrix(y_hat, y_truth)
+    count_matrix = compute_count_matrix(y_truth, y_hat)
  
     # Build dividers vector
     dividers = count_matrix.sum(axis=1)
  
  
  
@@ -151,15 +151,54 @@
     return (result_matrix, result_vector, result)
  
  
+def purity_score(y_truth, y_hat):
  
+    '''
+    Compute purity scores from the count matrix of y_truth and y_hat.
+
+    Returns a dict: "purity_cluster_score" (groups along axis 1 of the
+    count matrix), "purity_class_score" (groups along axis 0) and "K",
+    the square root of their product.
+    '''
+
+    def compute_purity_score(count_matrix, axis=0):
+        '''
+        Return (per-group purity vector, average weighted by group totals).
+        '''
+        counts = np.asarray(count_matrix, dtype=float)
+        totals = counts.sum(axis=axis, keepdims=True)
+        # keepdims lets the totals broadcast against their own group for
+        # either axis; groups with a zero total keep a purity of 0.
+        squared_totals = np.square(totals)
+        shares = np.divide(np.square(counts), squared_totals,
+                           out=np.zeros_like(counts), where=squared_totals != 0)
+        vector_purity = shares.sum(axis=axis)
+        scalar_purity = np.average(vector_purity, weights=totals.ravel())
+        return (vector_purity, scalar_purity)
+
+    count_matrix = compute_count_matrix(y_truth, y_hat)
+    _, purity_cluster_score = compute_purity_score(count_matrix, 1)
+    _, purity_class_score = compute_purity_score(count_matrix, 0)
+
+    K = np.sqrt(purity_cluster_score * purity_class_score)
+    return {
+        "purity_class_score": purity_class_score,
+        "purity_cluster_score": purity_cluster_score,
+        "K": K
+    }
+
+
 if __name__ == "__main__":
     # Hypothesis
     y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
     # Truth
     y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])
  
-    (result_matrix, result_vector, result) = entropy(y, y_hat)
+    (result_matrix, result_vector, result) = entropy_score(y, y_hat)
  
+
+    print(purity_score(y, y_hat))
+    exit(1)
     print("Result matrix: ")
     print(result_matrix)
     print("Result vector: ")
...	...	@@ -90,7 +90,7 @@
90	90	)
91	91
92	92
93		-def compute_count_matrix(y_hat, y_truth):
	93	+def compute_count_matrix(y_truth, y_hat):
94	94	'''
95	95	Check the size of the lists with assertion
96	96	'''
...	...	@@ -121,7 +121,7 @@
121	121	return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
122	122
123	123	# Build count matrix
124		- count_matrix = compute_count_matrix(y_hat, y_truth)
	124	+ count_matrix = compute_count_matrix(y_truth, y_hat)
125	125
126	126	# Build dividers vector
127	127	dividers = count_matrix.sum(axis=1)
128	128
129	129
130	130
...	...	@@ -151,15 +151,54 @@
151	151	return (result_matrix, result_vector, result)
152	152
153	153
	154	+def purity_score(y_truth, y_hat):
154	155
	156	+ def divide_line(a, divider):
	157	+ '''
	158	+ Sub function used for dividing matrix by a vector line by line.
	159	+ '''
	160	+ return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0)
	161	+
	162	+ def compute_purity_score(count_matrix, axis=0):
	163	+ count_per_row = count_matrix.sum(axis=axis)
	164	+ dividers = np.square(count_per_row)
	165	+ count_matrix_squared = np.square(count_matrix)
	166	+ matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix_squared, dtype=np.float), dividers)
	167	+ vector_purity = np.sum(matrix_divided, axis=axis)
	168	+
	169	+ scalar_purity = np.average(vector_purity, weights=count_per_row)
	170	+ return (vector_purity, scalar_purity)
	171	+
	172	+
	173	+ count_matrix = compute_count_matrix(y_truth, y_hat)
	174	+ _, purity_cluster_score = compute_purity_score(count_matrix, 1)
	175	+ _, purity_class_score = cluster_purity = compute_purity_score(count_matrix, 0)
	176	+
	177	+ K = np.sqrt(purity_cluster_score * purity_class_score)
	178	+
	179	+ for i in range(count_matrix.shape[0]):
	180	+
	181	+ for j in range(count_matrix.shape[1]):
	182	+ count_matrix[i][j]
	183	+ count_matrix[i]
	184	+ return {
	185	+ "purity_class_score": purity_class_score,
	186	+ "purity_cluster_score": purity_cluster_score,
	187	+ "K": K
	188	+ }
	189	+
	190	+
155	191	if __name__ == "__main__":
156	192	# Hypothesis
157	193	y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
158	194	# Truth
159	195	y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])
160	196
161		- (result_matrix, result_vector, result) = entropy(y, y_hat)
	197	+ (result_matrix, result_vector, result) = entropy_score(y, y_hat)
162	198
	199	+
	200	+ print(purity_score(y, y_hat))
	201	+ exit(1)
163	202	print("Result matrix: ")
164	203	print(result_matrix)
165	204	print("Result vector: ")