Commit 1f8612ebfd7fe8173f5e7f5374192182a1064da3

Authored by Mathias
1 parent adbca3b1ce
Exists in master

repaired memory error due to np.log2 behaviour

Showing 1 changed file with 167 additions and 0 deletions Side-by-side Diff

  1 +'''
  2 +This module is a part of my library.
  3 +It aims to compute some measures for clustering.
  4 +'''
  5 +
  6 +import numpy as np
  7 +
def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
    '''
    Compute the disequilibrium for all the clusters.

    The disequilibrium is the element-wise difference between two
    normalised clustering matrices (matrix1 - matrix2).

    isGlobal lets the caller choose the denominator of the normalisation:
    - True  : every value is divided by the sum of the whole matrix
    - False : every value is divided by the sum of its own row
    A denominator equal to zero yields 0 for the affected values.

    mod is an optional string of space-separated keywords, applied in
    the order they are written:
    - "power" : square the values
    - "human" : multiply the values by 100
    - "abs"   : take the absolute value
    None or an empty string leaves the difference untouched.

    Return a tuple (mask, result):
    - mask   : boolean matrix, False only where both inputs are zero
    - result : the (optionally transformed) difference matrix

    Raise ValueError (a subclass of Exception) on an unknown keyword.
    '''

    def normalise(matrix):
        '''
        Divide a matrix by its total or by its row sums, as floats.
        '''
        values = np.asarray(matrix, dtype=float)
        if isGlobal:
            dividers = values.sum()
        else:
            # keepdims gives a column vector that broadcasts over each row
            dividers = values.sum(axis=1, keepdims=True)
        return np.divide(
            values,
            dividers,
            out=np.zeros_like(values),
            where=dividers != 0,
        )

    transforms = {
        "power": lambda values: np.power(values, 2),
        "human": lambda values: values * 100,
        "abs": np.absolute,
    }

    result = normalise(matrix1) - normalise(matrix2)

    # A cell is masked out only when it is empty in both matrices
    mask = np.logical_or(np.asarray(matrix1) != 0, np.asarray(matrix2) != 0)

    # split() without argument ignores empty and repeated separators,
    # so None, "" and "power  abs" are all handled gracefully
    for word in (mod or "").split():
        if word not in transforms:
            raise ValueError(
                "Need to specify an accepted mod of the disequilibrium "
                "(\"power\", \"human\" or \"abs\")"
            )
        result = transforms[word](result)

    return (mask, result)
  59 +
  60 +
  61 +
def disequilibrium_mean_by_cluster(mask, matrix):
    '''
    Mean of the disequilibrium for each cluster.

    matrix holds the disequilibrium values, one row per cluster.
    mask flags the cells that count in the mean: the total of each row
    of matrix is divided by the number of True cells in the same row
    of mask.

    A row whose mask is entirely False yields 0 instead of dividing by
    zero (which would produce NaN and contaminate any later sum).

    Return a float vector with one value per row.
    '''
    values = np.asarray(matrix, dtype=float)
    flags = np.asarray(mask)

    # Reduce over every axis except the first one, so that each row
    # (cluster) ends up with a single total and a single count
    totals = values.sum(axis=tuple(range(1, values.ndim)))
    counts = flags.sum(axis=tuple(range(1, flags.ndim)))

    return np.divide(
        totals,
        counts,
        out=np.zeros_like(totals),
        where=counts != 0,
    )
  75 +
  76 +
def disequilibrium(matrix1, matrix2, isGlobal=False):
    '''
    Disequilibrium matrix and disequilibrium value.

    Return a tuple (mask, matrix, value):
    - mask   : the mask computed by disequilibrium_
    - matrix : the raw disequilibrium multiplied by 100
    - value  : the per-cluster means of the squared disequilibrium,
               summed and divided by the number of rows of matrix1
    '''
    mask, raw = disequilibrium_(matrix1, matrix2, isGlobal)

    as_percent = raw * 100
    squared = np.power(raw, 2)

    per_cluster = disequilibrium_mean_by_cluster(mask, squared)
    overall = per_cluster.sum() / matrix1.shape[0]

    return (mask, as_percent, overall)
  91 +
  92 +
def compute_count_matrix(y_hat, y_truth):
    '''
    Build the count matrix of two label vectors.

    Cell [i][j] holds the number of positions where y_hat is i and
    y_truth is j. The labels are used directly as row and column
    indices, so they have to be integers.

    y_hat and y_truth may be lists or numpy arrays.

    Return a float matrix of shape (max(y_hat) + 1, max(y_truth) + 1),
    or an empty (0, 0) matrix when the inputs are empty.

    Raise AssertionError when the two vectors differ in length.
    '''
    y_hat = np.asarray(y_hat)
    y_truth = np.asarray(y_truth)

    # Check size of the lists. Raised explicitly (instead of an assert
    # statement) so the check survives `python -O` with the same type.
    if len(y_hat) != len(y_truth):
        raise AssertionError(
            f"Matrices should have the same length y_hat: {len(y_hat)}, y_truth: {len(y_truth)}"
        )

    # Nothing to count: max() of an empty vector is undefined
    if len(y_hat) == 0:
        return np.zeros((0, 0))

    # Build count matrix
    count_matrix = np.zeros((y_hat.max() + 1, y_truth.max() + 1))
    # add.at accumulates repeated (row, column) pairs, unlike `+=`
    # with fancy indexing which would count each pair only once
    np.add.at(count_matrix, (y_hat, y_truth), 1)
    return count_matrix
  105 +
  106 +
def entropy_score(y_truth, y_hat):
    '''
    Entropy of a clustering compared with the true classes.

    Need to use a label encoder before giving y_hat and y_truth.
    Don't use one hot labels.

    Return a tuple with:
    - result_matrix : the matrix with the log multiplied probabilities
      (-P(x) * log2(P(x))), where P(x) is a count divided by the total
      of its row in the count matrix
    - result_vector : the vector summing the entropy of each class.
      Each value corresponds to a cluster.
    - result : the final entropy measure of the clustering, i.e. the
      cluster entropies weighted by the size of each cluster

    Raise ValueError if a NaN shows up in the cluster entropies; the
    intermediate matrices are included in the message.
    '''
    # Build count matrix
    count_matrix = np.asarray(compute_count_matrix(y_hat, y_truth), dtype=float)

    # Build dividers vector (size of each cluster)
    dividers = count_matrix.sum(axis=1)
    # As a column, so it broadcasts along each row of the count matrix
    row_totals = dividers[:, np.newaxis]

    matrix_divided = np.divide(
        count_matrix,
        row_totals,
        out=np.zeros_like(count_matrix),
        where=row_totals != 0,
    )

    # log2(0) is -inf and 0 * -inf is NaN: only take the log of the
    # non-empty cells and leave the others at 0
    log_matrix = np.zeros(matrix_divided.shape)
    np.log2(matrix_divided, out=log_matrix, where=count_matrix != 0)

    result_matrix = -1 * np.multiply(matrix_divided, log_matrix)
    result_vector = result_matrix.sum(axis=1)

    if np.isnan(np.sum(result_vector)):
        # Raise instead of printing and exiting: a library function
        # must not terminate the interpreter of its caller
        raise ValueError(
            "An error occured due to nan value\n"
            f"COUNT MATRIX\n{count_matrix}\n"
            f"MATRIX DIVIDED\n{matrix_divided}\n"
            f"RESULT MATRIX\n{result_matrix}\n"
            f"VECTOR MATRIX\n{result_vector}"
        )

    result = (result_vector * dividers / dividers.sum()).sum()
    return (result_matrix, result_vector, result)
  152 +
  153 +
  154 +
if __name__ == "__main__":
    # Small manual check of entropy_score on hand-written labels.
    # Hypothesis
    y_hat = np.asarray([0, 1, 2, 0, 1, 0, 3, 2, 2, 3, 3, 0])
    # Truth
    y = np.asarray([0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3])

    # The function is named entropy_score and takes (y_truth, y_hat)
    (result_matrix, result_vector, result) = entropy_score(y, y_hat)

    print("Result matrix: ")
    print(result_matrix)
    print("Result vector: ")
    print(result_vector)
    print("Result: ", result)