bin/measure_clustering.py

  '''
  Compute measures from a clustering, such as
  disequilibrium and entropy.
  '''
  # TODO: just per-character disequilibrium to start with.
  
  import argparse
  import json

  import numpy as np
  from sklearn import metrics, preprocessing

  from data import read_file, index_by_id
  from measures import disequilibrium, entropy
  
  # -- ARGPARSE
  parser = argparse.ArgumentParser(description="Compute metrics from a clustering")
  parser.add_argument("clustering", type=str,
                      help="clustering file")
  parser.add_argument("classlst", type=str,
                      help="list file providing the class of each item")
  parser.add_argument("trainlst", type=str,
                      help="train list file")
  parser.add_argument("vallst", type=str,
                      help="validation list file")
  parser.add_argument("--outfile", type=str, default="out.out",
                      help="output file path")
  
  args = parser.parse_args()
  CLUSTERING = args.clustering
  CLASS_LST = args.classlst
  TRAIN_LST = args.trainlst
  VAL_LST = args.vallst
  OUTFILE = args.outfile
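
  # Example invocation (the file names below are hypothetical placeholders):
  #   python bin/measure_clustering.py clustering.txt classes.lst \
  #       train.lst val.lst --outfile measures.json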
  
  
  # -- READ FILES
  clustering = read_file(CLUSTERING)
  clustering_ind = index_by_id(clustering)
  
  class_lst = read_file(CLASS_LST)
  class_lst_ind = index_by_id(class_lst)
  
  train_lst = read_file(TRAIN_LST)
  val_lst = read_file(VAL_LST)
  
  # -- GET CLASSES AND CLUSTERS
  train_classes = np.asarray([class_lst_ind[x[0][0]][x[0][3]][0][1] for x in train_lst])
  train_clusters = np.asarray([clustering_ind[x[0][0]][x[0][3]][0][1] for x in train_lst], dtype=int)

  val_classes = np.asarray([class_lst_ind[x[0][0]][x[0][3]][0][1] for x in val_lst])
  val_clusters = np.asarray([clustering_ind[x[0][0]][x[0][3]][0][1] for x in val_lst], dtype=int)
  
  # Per-cluster item counts (currently unused below)
  unique, count = np.unique(train_clusters, return_counts=True)
  train_cluster_ind = dict(zip(unique, count))

  unique, count = np.unique(val_clusters, return_counts=True)
  val_cluster_ind = dict(zip(unique, count))
  
  
  def generate_count_matrix(classes1, clusters1, classes2, clusters2):
      '''
      Generate one count matrix per set.
      Rows are clusters and columns are classes.
      A cell contains the number of occurrences of a
      character (class) in a specific cluster.
      '''
  
      # Index Classes
      classe1_unique = np.unique(classes1)
      classe2_unique = np.unique(classes2)
      all_classes = np.unique(np.concatenate((classe1_unique, classe2_unique)))
      
      # Label Encoder for classes
      le = preprocessing.LabelEncoder()
      le.fit(all_classes)
  
      # Index clusters
      cluster1_unique = np.unique(clusters1)
      cluster2_unique = np.unique(clusters2)

      all_clusters = np.unique(np.concatenate((cluster1_unique, cluster2_unique)))

      # Map each cluster label to a row index
      # (cluster labels are not guaranteed to be contiguous)
      cluster_rows = {c: i for i, c in enumerate(all_clusters)}

      # Create matrices: rows are clusters, columns are classes
      counts_matrix1 = np.zeros((len(all_clusters), len(all_classes)))
      counts_matrix2 = np.zeros((len(all_clusters), len(all_classes)))
  
      for cluster in all_clusters:
          # First extract the classes present in this cluster
          cc1 = np.extract(np.asarray(clusters1) == cluster, np.asarray(classes1))
          cc2 = np.extract(np.asarray(clusters2) == cluster, np.asarray(classes2))
  
          cc1_unique, cc1_counts = np.unique(cc1, return_counts=True)
          cc1_ind = dict(zip(cc1_unique, cc1_counts))
  
          cc2_unique, cc2_counts = np.unique(cc2, return_counts=True)
          cc2_ind = dict(zip(cc2_unique, cc2_counts))
  
          row = cluster_rows[cluster]
          for class_ in all_classes:
              class_id = le.transform([class_])[0]
              if class_ in cc1_ind:
                  counts_matrix1[row][class_id] = cc1_ind[class_]
              if class_ in cc2_ind:
                  counts_matrix2[row][class_id] = cc2_ind[class_]
      return (counts_matrix1, counts_matrix2)
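
  # Toy illustration of the function above (hypothetical labels):
  # classes1=["a", "b", "a"], clusters1=[0, 0, 1] and
  # classes2=["a", "b"],      clusters2=[1, 1]
  # yield all_classes=["a", "b"], all_clusters=[0, 1] and
  #   counts_matrix1 = [[1., 1.],    counts_matrix2 = [[0., 0.],
  #                     [1., 0.]]                      [1., 1.]]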
  
  
  # V-measure is the harmonic mean of homogeneity and completeness
  train_vscore = metrics.cluster.v_measure_score(train_classes, train_clusters)
  val_vscore = metrics.cluster.v_measure_score(val_classes, val_clusters)

  train_homogeneity = metrics.homogeneity_score(train_classes, train_clusters)
  val_homogeneity = metrics.homogeneity_score(val_classes, val_clusters)
  train_completeness = metrics.completeness_score(train_classes, train_clusters)
  val_completeness = metrics.completeness_score(val_classes, val_clusters)
  
  counts_matrix1, counts_matrix2 = generate_count_matrix(train_classes, train_clusters, val_classes, val_clusters)
  mask, dis_human, dis_measures = disequilibrium(counts_matrix1, counts_matrix2, isGlobal=False)
  
  # Per-set entropy (currently not written to the output file)
  (train_entropy_matrix, train_entropy) = entropy(counts_matrix1)
  (val_entropy_matrix, val_entropy) = entropy(counts_matrix2)
  
  results = {}
  results["train"] = {}
  results["train"]["vscore"] = train_vscore
  results["train"]["homogeneity"] = train_homogeneity
  results["train"]["completeness"] = val_completeness
  
  results["val"] = {}
  results["val"]["vscore"] = val_vscore
  results["val"]["homogeneity"] = val_homogeneity
  results["val"]["completeness"] = val_completeness
  
  results["disequilibrium"] = dis_measures
  
  # Alternative: disequilibrium(counts_matrix1, counts_matrix2, isGlobal=False, mod="pow")
  with open(OUTFILE, "w") as f:
      json.dump(results, f)
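
  # The resulting JSON has the shape (values illustrative):
  # {"train": {"vscore": ..., "homogeneity": ..., "completeness": ...},
  #  "val": {"vscore": ..., "homogeneity": ..., "completeness": ...},
  #  "disequilibrium": ...}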