Blame view

bin/measures.py 3.21 KB
60d1f63cd   Mathias Quillot   Script and lib th...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
  '''
  This module is part of a personal library.
  It computes evaluation measures for clusterings (disequilibrium and entropy).
  '''
  
  import numpy as np
  
  def disequilibrium_(matrix1, matrix2, isGlobal=False, mod=None):
      '''
      Compute the cell-by-cell disequilibrium between two count matrices.

      Both matrices are first turned into proportions, then the proportions
      of matrix2 are subtracted from those of matrix1.

      Parameters:
          matrix1, matrix2: 2-D array-likes of identical shape holding counts.
              NOTE(review): rows are presumably clusters and columns classes;
              confirm against the callers.
          isGlobal: selects the denominator used to build the proportions:
              - True  : every cell is divided by the sum of the whole matrix
              - False : every cell is divided by the sum of its own row
              A zero denominator yields a proportion of 0 instead of NaN.
          mod: optional whitespace-separated list of post-processing steps,
              applied to the difference in the order given:
              - "power" : square the values
              - "human" : multiply the values by 100
              - "abs"   : take the absolute value
              None or an empty string applies no step.

      Returns:
          A tuple (mask, result). mask is a boolean matrix that is False only
          where both input matrices are 0; result is the (optionally
          post-processed) difference of proportions.

      Raises:
          ValueError: if mod contains a word that is not an accepted step.
      '''

      def to_proportions(matrix):
          '''
          Divide the matrix by its global sum or by its row sums.
          '''
          # The builtin float replaces the np.float alias removed from NumPy.
          counts = np.asarray(matrix, dtype=float)
          if isGlobal:
              dividers = counts.sum()
          else:
              # keepdims lets each row broadcast against its own total.
              dividers = counts.sum(axis=1, keepdims=True)
          return np.divide(
              counts,
              dividers,
              out=np.zeros_like(counts),
              where=dividers != 0,
          )

      result = to_proportions(matrix1) - to_proportions(matrix2)

      # A cell is masked out only when it is empty in both matrices.
      mask = np.logical_or(np.asarray(matrix1) != 0, np.asarray(matrix2) != 0)

      steps = {
          "power": lambda values: np.power(values, 2),
          "human": lambda values: values * 100,
          "abs": np.absolute,
      }

      # None and "" both mean "no post-processing".
      if mod:
          for word in mod.split():
              if word not in steps:
                  raise ValueError('Need to specify an accepted mod of the disequilibrium ("power", "human" or "abs"')
              result = steps[word](result)

      return (mask, result)
  
  
  
  def disequilibrium_mean_by_cluster(mask, matrix):
      '''
      Mean disequilibrium of each row (cluster).

      Parameters:
          mask: boolean array-like with one row per cluster; the number of
              True cells in a row is the denominator of that row's mean.
          matrix: disequilibrium values with one row per cluster, as
              computed from the class occurrence counts of each cluster.

      Returns:
          A 1-D float array with one entry per row: the sum of the row of
          matrix divided by the number of True cells in the same row of mask.
          A row whose mask is entirely False yields 0.0 rather than NaN, so
          an empty cluster does not poison sums computed from the result.
      '''
      nb_k = len(matrix)
      results = np.zeros(nb_k)
      if nb_k == 0:
          return results

      # Flatten everything behind the first axis so each row is summed whole.
      row_sums = np.asarray(matrix, dtype=float).reshape(nb_k, -1).sum(axis=1)
      row_counts = np.asarray(mask).reshape(nb_k, -1).sum(axis=1)

      # Rows with a zero count keep the 0.0 already stored in results.
      np.divide(row_sums, row_counts, out=results, where=row_counts != 0)
      return results
  
  
  def disequilibrium(matrix1, matrix2, isGlobal=False):
      '''
      Disequilibrium matrix and global disequilibrium value.

      Returns a tuple of three items:
          - the mask returned by disequilibrium_
          - the raw disequilibrium matrix multiplied by 100
          - a single value: the per-cluster means of the squared
            disequilibrium, summed and divided by the number of rows
            of matrix1
      '''
      mask, raw = disequilibrium_(matrix1, matrix2, isGlobal)

      # Global value is built from the squared differences.
      squared = np.power(raw, 2)
      cluster_means = disequilibrium_mean_by_cluster(mask, squared)
      global_value = cluster_means.sum() / matrix1.shape[0]

      percent = raw * 100
      return (mask, percent, global_value)
  
  
  def entropy(count_matrix):
      '''
      Base-2 entropy of a count matrix, row by row and overall.

      Each row is turned into proportions by dividing it by its own sum.
      The overall value is the sum of the row entropies, each weighted by
      the share of the row total in the grand total.

      Parameters:
          count_matrix: 2-D array-like of counts.
              NOTE(review): rows are presumably clusters and columns classes;
              confirm against the callers.

      Returns:
          A tuple (result_matrix, result). result_matrix holds
          -p * log2(p) for every cell, with 0 where the proportion is 0;
          result is the weighted sum described above. An all-zero matrix
          gives a result of 0.0.
      '''
      # The builtin float replaces the np.float alias removed from NumPy.
      counts = np.asarray(count_matrix, dtype=float)
      row_totals = counts.sum(axis=1, keepdims=True)

      proportions = np.divide(
          counts,
          row_totals,
          out=np.zeros_like(counts),
          where=row_totals != 0,
      )

      # out= is required: without it the cells skipped by where= are left
      # uninitialised and could turn the product below into NaN.
      logs = np.log2(
          proportions,
          out=np.zeros_like(proportions),
          where=proportions > 0,
      )
      result_matrix = -1 * proportions * logs

      weighted = result_matrix.sum(axis=1) * row_totals[:, 0]
      grand_total = row_totals.sum()
      # A zero grand total leaves every share at 0 instead of dividing by 0.
      shares = np.divide(
          weighted,
          grand_total,
          out=np.zeros_like(weighted),
          where=grand_total != 0,
      )
      result = shares.sum()
      return (result_matrix, result)