Commit aeff19f9513b6f3f650ae1c36eff462c545fbf52

1 parent: 4ed3ebc7d7

Exists in: master

### purity measure added and tested

Showing **1 changed file** with **42 additions** and **3 deletions**
Side-by-side Diff

volia/measures.py

... | ... | @@ -90,7 +90,7 @@ |

90 | 90 | ) |

91 | 91 | |

92 | 92 | |

93 | -def compute_count_matrix(y_hat, y_truth): | |

93 | +def compute_count_matrix(y_truth, y_hat): | |

94 | 94 | ''' |

95 | 95 | Check the size of the lists with assertion |

96 | 96 | ''' |

... | ... | @@ -121,7 +121,7 @@ |

121 | 121 | return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0) |

122 | 122 | |

123 | 123 | # Build count matrix |

124 | - count_matrix = compute_count_matrix(y_hat, y_truth) | |

124 | + count_matrix = compute_count_matrix(y_truth, y_hat) | |

125 | 125 | |

126 | 126 | # Build dividers vector |

127 | 127 | dividers = count_matrix.sum(axis=1) |

128 | 128 | |

129 | 129 | |

130 | 130 | |

... | ... | @@ -151,15 +151,54 @@ |

151 | 151 | return (result_matrix, result_vector, result) |

152 | 152 | |

153 | 153 | |

154 | +def purity_score(y_truth, y_hat): | |

154 | 155 | |

156 | + def divide_line(a, divider): | |

157 | + ''' | |

158 | + Sub function used for dividing matrix by a vector line by line. | |

159 | + ''' | |

160 | + return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0) | |

161 | + | |

162 | + def compute_purity_score(count_matrix, axis=0): | |

163 | + count_per_row = count_matrix.sum(axis=axis) | |

164 | + dividers = np.square(count_per_row) | |

165 | + count_matrix_squared = np.square(count_matrix) | |

166 | + matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix_squared, dtype=np.float), dividers) | |

167 | + vector_purity = np.sum(matrix_divided, axis=axis) | |

168 | + | |

169 | + scalar_purity = np.average(vector_purity, weights=count_per_row) | |

170 | + return (vector_purity, scalar_purity) | |

171 | + | |

172 | + | |

173 | + count_matrix = compute_count_matrix(y_truth, y_hat) | |

174 | + _, purity_cluster_score = compute_purity_score(count_matrix, 1) | |

175 | + _, purity_class_score = cluster_purity = compute_purity_score(count_matrix, 0) | |

176 | + | |

177 | + K = np.sqrt(purity_cluster_score * purity_class_score) | |

178 | + | |

179 | + for i in range(count_matrix.shape[0]): | |

180 | + | |

181 | + for j in range(count_matrix.shape[1]): | |

182 | + count_matrix[i][j] | |

183 | + count_matrix[i] | |

184 | + return { | |

185 | + "purity_class_score": purity_class_score, | |

186 | + "purity_cluster_score": purity_cluster_score, | |

187 | + "K": K | |

188 | + } | |

189 | + | |

190 | + | |

155 | 191 | if __name__ == "__main__": |

156 | 192 | # Hypothesis |

157 | 193 | y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0]) |

158 | 194 | # Truth |

159 | 195 | y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]) |

160 | 196 | |

161 | - (result_matrix, result_vector, result) = entropy(y, y_hat) | |

197 | + (result_matrix, result_vector, result) = entropy_score(y, y_hat) | |

162 | 198 | |

199 | + | |

200 | + print(purity_score(y, y_hat)) | |

201 | + exit(1) | |

163 | 202 | print("Result matrix: ") |

164 | 203 | print(result_matrix) |

165 | 204 | print("Result vector: ") |