bin/measure_clustering.py

  '''
  Compute some measures from a clustering, such as
  disequilibrium and entropy.
  '''
  # TODO: Start with just the disequilibrium per character.
  
  import argparse
  from data import read_file, index_by_id
  import numpy as np
  from sklearn import preprocessing
  from measures import disequilibrium, entropy
  from sklearn import metrics
  import json
  
  # -- ARGPARSE
  parser = argparse.ArgumentParser(description="Compute metrics from clustering")
  parser.add_argument("clustering", type=str,
                      help="clustering file")
  parser.add_argument("classlst", type=str,
                      help="list from which the classes are taken")
  parser.add_argument("trainlst", type=str,
                      help="train list")
  parser.add_argument("vallst", type=str,
                      help="validation list")
  parser.add_argument("--outfile", type=str, default="out.out",
                      help="output file path")
  
  args = parser.parse_args()
  CLUSTERING = args.clustering
  CLASS_LST = args.classlst
  TRAIN_LST = args.trainlst
  VAL_LST = args.vallst
  OUTFILE = args.outfile
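
  # Example invocation (hypothetical file names):
  #   python bin/measure_clustering.py clustering.txt classes.lst train.lst val.lst \
  #       --outfile measures.json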
  
  
  # -- READ FILES
  clustering = read_file(CLUSTERING)
  clustering_ind = index_by_id(clustering)
  
  class_lst = read_file(CLASS_LST)
  class_lst_ind = index_by_id(class_lst)
  
  train_lst = read_file(TRAIN_LST)
  val_lst = read_file(VAL_LST)
  
  # -- GET CLASSES AND CLUSTERS
  train_classes = np.asarray([class_lst_ind[x[0][0]][x[0][3]][0][1] for x in train_lst])
  train_clusters = np.asarray([clustering_ind[x[0][0]][x[0][3]][0][1] for x in train_lst], dtype=int)

  val_classes = np.asarray([class_lst_ind[x[0][0]][x[0][3]][0][1] for x in val_lst])
  val_clusters = np.asarray([clustering_ind[x[0][0]][x[0][3]][0][1] for x in val_lst], dtype=int)
  
  unique, count = np.unique(train_clusters, return_counts=True)
  train_cluster_ind = dict(zip(unique, count))
  
  unique, count = np.unique(val_clusters, return_counts=True)
  val_cluster_ind = dict(zip(unique, count))
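  # The two dicts above map each cluster id to its number of members in the
  # train and validation sets respectively.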
  
  
  #print(np.unique(train_classes, return_counts=True))

  #sub = np.extract(train_clusters == 1, train_classes)
  #print(np.unique(sub, return_counts=True))


  def generate_count_matrix(classes1, clusters1, classes2, clusters2):
      '''
      Generate count matrices for the two sets.
      Rows are clusters and columns are classes.
      A cell contains the number of occurrences of a
      character in a specific cluster.
      '''
  
      # Index Classes
      classe1_unique = np.unique(classes1)
      classe2_unique = np.unique(classes2)
      all_classes = np.unique(np.concatenate((classe1_unique, classe2_unique)))
         
      # Label Encoder for classes
      le = preprocessing.LabelEncoder()
      le.fit(all_classes)
  
      # Index
      cluster1_unique = np.unique(clusters1)
      cluster2_unique = np.unique(clusters2)
      all_clusters = np.unique(np.concatenate((cluster1_unique, cluster2_unique)))
      
      # Warn when cluster ids are not contiguous (some clusters would be empty)
      if np.max(all_clusters) != len(all_clusters) - 1:
          print("WARNING: Some clusters are empty. Max value: "
                + str(np.max(all_clusters))
                + " Nb values: " + str(len(all_clusters)))
      # Create count matrices: rows are clusters, columns are classes
      counts_matrix1 = np.zeros((np.max(all_clusters) + 1, len(all_classes)))
      counts_matrix2 = np.zeros((np.max(all_clusters) + 1, len(all_classes)))
  
      for cluster in all_clusters:
          
          # First, extract the classes present in this cluster
          cc1 = np.extract(np.asarray(clusters1) == cluster, np.asarray(classes1))
          cc2 = np.extract(np.asarray(clusters2) == cluster, np.asarray(classes2))
  
          cc1_unique, cc1_counts = np.unique(cc1, return_counts=True)
          cc1_ind = dict(zip(cc1_unique, cc1_counts))
  
          cc2_unique, cc2_counts = np.unique(cc2, return_counts=True)
          cc2_ind = dict(zip(cc2_unique, cc2_counts))
  
          for class_ in all_classes:
              class_id = le.transform([class_])[0]
              if class_ in cc1_ind:
                  counts_matrix1[int(cluster)][int(class_id)] = cc1_ind[class_]
              if class_ in cc2_ind:
                  counts_matrix2[int(cluster)][int(class_id)] = cc2_ind[class_]
      return (counts_matrix1, counts_matrix2)
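
  # Illustrative example (not executed): with clusters = [0, 0, 1] and
  # classes = ["anna", "bob", "anna"] used for both sets, each returned matrix is
  #   [[1., 1.],   # cluster 0 contains one "anna" and one "bob"
  #    [1., 0.]]   # cluster 1 contains one "anna"
  # since the label encoder maps "anna" -> 0 and "bob" -> 1.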
  
  
  train_vscore = metrics.cluster.v_measure_score(train_classes, train_clusters)
  val_vscore = metrics.cluster.v_measure_score(val_classes, val_clusters)
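  # V-measure is the harmonic mean of homogeneity (each cluster contains only
  # members of a single class) and completeness (all members of a class are
  # assigned to the same cluster).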
  
  train_homogeneity = metrics.homogeneity_score(train_classes, train_clusters)
  val_homogeneity = metrics.homogeneity_score(val_classes, val_clusters)
  train_completeness = metrics.completeness_score(train_classes, train_clusters)
  val_completeness = metrics.completeness_score(val_classes, val_clusters)
  
  counts_matrix1, counts_matrix2 = generate_count_matrix(train_classes, train_clusters, val_classes, val_clusters)

  mask, dis_human, dis_measures = disequilibrium(counts_matrix1, counts_matrix2, isGlobal=False)
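  # Only dis_measures is written to the output below; mask and dis_human are
  # not used in the rest of the script.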

  (train_entropy_matrix, train_entropy) = entropy(counts_matrix1)
  (val_entropy_matrix, val_entropy) = entropy(counts_matrix2)
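  # Only train_entropy and val_entropy are written to the results below; the
  # entropy matrices returned alongside them are not used further.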
  
  results = {}
  results["train"] = {}
  results["train"]["entropy"] = train_entropy 
  results["train"]["vscore"] = train_vscore
  results["train"]["homogeneity"] = train_homogeneity
  results["train"]["completeness"] = train_completeness
  
  results["val"] = {}
  results["val"]["entropy"] = val_entropy 
  results["val"]["vscore"] = val_vscore
  results["val"]["homogeneity"] = val_homogeneity
  results["val"]["completeness"] = val_completeness
  
  results["disequilibrium"] = dis_measures
  
  #results = disequilibrium(counts_matrix1, counts_matrix2, isGlobal=False, mod="pow")
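  # The written JSON has the following shape (value types come from the local
  # measures library and sklearn):
  #   {"train": {"entropy": ..., "vscore": ..., "homogeneity": ..., "completeness": ...},
  #    "val":   {"entropy": ..., "vscore": ..., "homogeneity": ..., "completeness": ...},
  #    "disequilibrium": ...}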
  with open(OUTFILE, "w") as f:
      json_content = json.dumps(results)
      f.write(json_content)