### Blame view

bin/measures.py 3.16 KB
 ```1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105``` ``` ''' This module is a part of my library. It aims to compute some measures for clustering. ''' import numpy as np def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None): ''' Compute disequilibrium for all the clusters. The disequilibrium is compute from the difference between two clustering sets. isGlobal permet à l'utilisateur de choisir le dénominateur de la fonction : - True : divise la valeur par le nombre d'élément du cluster - False : divise la valeur par le nombre d'élément total withPower permet à l'utilisateur de décider d'appliquer un carré 2 ou une valeur absolue. ''' def divide_line(a, divider): ''' Sub function used for dividing matrix by a vector line by line. ''' return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0) dividers1 = 0 dividers2 = 0 if isGlobal: dividers1 = matrix1.sum() dividers2 = matrix2.sum() else: dividers1 = matrix1.sum(axis=1) dividers2 = matrix2.sum(axis=1) matrix1_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix1, dtype=np.float), dividers1) matrix2_divided = np.apply_along_axis(divide_line, 0, np.asarray(matrix2, dtype=np.float), dividers2) diff = matrix1_divided - matrix2_divided mask = (matrix2==0) & (matrix1==0) result = diff if mod != None or mod == "": for word in mod.split(" "): if word == "power": result = np.power(result,2) elif word == "human": result = result * 100 elif word == "abs": result = np.absolute(result) else: raise Exception("Need to specify an accepted mod of the disequilibrium (\"power\", \"human\" or \"abs\"") return (mask, result) def disequilibrium_mean_by_cluster(mask, matrix): ''' Mean of disequilibrium matrix is the disequilibrium calculated from number of occurences belonging to a class, for each cluster. ''' nb_k = len(matrix) results = np.zeros((nb_k)) for i in range(nb_k): results[i] = matrix[i].sum() / mask[i].sum() return results def disequilibrium(matrix1, matrix2, isGlobal=False): ''' Disequilibrium matrix And Disequilibrium value ''' mask, result = disequilibrium_(matrix1, matrix2, isGlobal) result_human = result * 100 result_power = np.power(result, 2) return ( mask, result_human, disequilibrium_mean_by_cluster(mask, result_power).sum() ) def entropy(count_matrix): def divide_line(a, divider): ''' Sub function used for dividing matrix by a vector line by line. ''' return np.divide(a, divider, out=np.zeros_like(a), where=divider!=0) dividers = count_matrix.sum(axis=1) matrix_divided = np.apply_along_axis(divide_line, 0, np.asarray(count_matrix, dtype=np.float), dividers) result_matrix = -1 * matrix_divided * np.log2(matrix_divided, where=count_matrix != 0) result = result_matrix.sum(axis=1) * dividers / dividers.sum() result = result.sum() return (result_matrix, result) ```