Blame view

scripts/evaluations/clustering.py 8.04 KB
e403ed5fb   Mathias   Add a script that...
1
2
3
4
5
6
7
8
9
  '''
  This script allows the user to evaluate a classification system on new labels using clustering methods.
  The algorithms are applied on the given latent space (embedding).
  '''
  import argparse
  import numpy as np
  import pandas as pd
  import os
  import time
adbca3b1c   Mathias   Save the kmeans m...
10
  import pickle
4ed3ebc7d   Mathias   Save results on a...
11
  import csv
e403ed5fb   Mathias   Add a script that...
12
13
  from sklearn.preprocessing import LabelEncoder
  from sklearn.metrics.pairwise import pairwise_distances
e403ed5fb   Mathias   Add a script that...
14
15
  from sklearn.cluster import KMeans
  from sklearn.manifold import TSNE
4ed3ebc7d   Mathias   Save results on a...
16
  from sklearn.metrics import f1_score, homogeneity_score, completeness_score, v_measure_score
e403ed5fb   Mathias   Add a script that...
17
18
19
  import matplotlib.pyplot as plt
  
  from volia.data_io import read_features,read_lst
15b183a24   Mathias   Add purity measur...
20
  from volia.measures import entropy_score, purity_score
e403ed5fb   Mathias   Add a script that...
21

4ed3ebc7d   Mathias   Save results on a...
22
23
24
25
26
  '''
  TODO: 
  - Add an option allowing the user to choose the number of
  clustering models to train in order to compute the average and the
  standard deviation of each measure.
  '''
e403ed5fb   Mathias   Add a script that...
27

e403ed5fb   Mathias   Add a script that...
28

4ed3ebc7d   Mathias   Save results on a...
29
30
  def train_clustering(label_encoder, feats, classes, outdir, prefix=None):
      '''
      Fit one KMeans model on the latent space, score it and save the artefacts.

      Each cluster is named after the most represented true label among its
      samples. The "predicted label" of a sample is the name of its cluster,
      and every measure compares these predicted labels with the true ones.

      Parameters:
      - label_encoder: fitted LabelEncoder able to transform every value of `classes`.
      - feats: feature matrix given to KMeans and t-SNE (one row per sample).
      - classes: true class names, aligned with the rows of `feats`.
      - outdir: existing directory receiving "_kmeans.pkl", the evaluation log
        and the t-SNE figure.
      - prefix: prefix of the log and figure file names. When None, the
        module-level `args.prefix` is used if it exists, otherwise "".

      Returns a dict with the keys "f1" (mean of the per-class F1 scores),
      "entropy", "homogeneity", "completeness", "v-measure",
      "purity_class_score", "purity_cluster score" and "K".
      '''
      began = time.time()

      if prefix is None:
          # Historically the prefix was read from the global `args` created by the
          # command line entry point; keep that fallback so existing calls with
          # four arguments produce the same file names.
          prefix = getattr(globals().get("args"), "prefix", "")

      class_names = label_encoder.classes_
      num_classes = len(class_names)
      # Encode the true labels from the arguments instead of depending on a
      # module-level `labels` array.
      labels = label_encoder.transform(classes)

      # Compute KMEANS clustering on data
      estimator = KMeans(
          n_clusters=num_classes,
          n_init=100,
          tol=1e-6,  # was written `10-6`, which evaluates to 4
          algorithm="elkan"
      )
      estimator.fit(feats)
      print(f"Kmeans: processed {estimator.n_iter_} iterations - intertia={estimator.inertia_}")
      with open(os.path.join(outdir, "_kmeans.pkl"), "wb") as f:
          pickle.dump(estimator, f)

      # contains distance to each cluster for each sample
      dist_space = estimator.transform(feats)
      predictions = np.argmin(dist_space, axis=1)

      # gives each cluster a name (considering most represented character)
      dataframe = pd.DataFrame({
          "label": pd.Series([class_names[x] for x in labels]),
          "prediction": pd.Series(predictions)
      })

      def find_cluster_name_fn(c):
          mask = dataframe["prediction"] == c
          return dataframe[mask]["label"].value_counts(sort=False).idxmax()

      cluster_names = [find_cluster_name_fn(c) for c in range(num_classes)]
      predicted_labels = label_encoder.transform(
          [cluster_names[pred] for pred in predictions])

      # F-measure
      fscores = f1_score(labels, predicted_labels, average=None)
      fscores_str = "\n".join(
          "{0:25s}: {1:.4f}".format(class_names[i], score)
          for i, score in enumerate(fscores))

      # Entropy
      _, _, entropy = entropy_score(labels, predicted_labels)

      # Homogenity
      homogeneity = homogeneity_score(labels, predicted_labels)

      # Completeness
      completeness = completeness_score(labels, predicted_labels)

      # V-Measure
      v_measure = v_measure_score(labels, predicted_labels)

      # Purity
      purity_scores = purity_score(labels, predicted_labels)
      purity_class_score = purity_scores["purity_class_score"]
      purity_cluster_score = purity_scores["purity_cluster_score"]
      K = purity_scores["K"]

      # Write results
      # NOTE(review): the log name starts with "_" whereas the figure name does
      # not; both names are kept unchanged in case other tools rely on them.
      with open(os.path.join(outdir, "_" + prefix + "eval_clustering.log"), "w") as fd:
          print(f"F1-scores for each classes:\n{fscores_str}", file=fd)
          print(f"Entropy: {entropy}", file=fd)
          print(f"Global score : {np.mean(fscores)}", file=fd)
          print(f"Homogeneity: {homogeneity}", file=fd)
          print(f"completeness: {completeness}", file=fd)
          print(f"v-measure: {v_measure}", file=fd)
          print(f"purity class score: {purity_class_score}", file=fd)
          print(f"purity cluster score: {purity_cluster_score}", file=fd)
          print(f"purity overall evaluation criterion (K): {K}", file=fd)

      # Process t-SNE and plot
      tsne_estimator = TSNE()
      embeddings = tsne_estimator.fit_transform(feats)
      print("t-SNE: processed {0} iterations - KL_divergence={1:.4f}".format(
          tsne_estimator.n_iter_, tsne_estimator.kl_divergence_))

      fig, [axe1, axe2] = plt.subplots(1, 2, figsize=(10, 5))
      for c, name in enumerate(class_names):
          true_mask = labels == c
          axe1.scatter(embeddings[true_mask][:, 0], embeddings[true_mask][:, 1], label=name, alpha=0.2, edgecolors=None)

          # Several clusters may be named after the same class: select on the
          # predicted label so that all of them are drawn, not only the first
          # cluster carrying that name.
          pred_mask = predicted_labels == c
          if not pred_mask.any():
              print("WARNING: no cluster found for {}".format(name))
              continue
          axe2.scatter(embeddings[pred_mask][:, 0], embeddings[pred_mask][:, 1], label=name, alpha=0.2, edgecolors=None)

      axe1.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
      axe1.set_title("true labels")
      axe2.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
      axe2.set_title("predicted cluster label")

      plt.suptitle("Kmeans Clustering")

      loc = os.path.join(
          outdir,
          prefix + "kmeans.pdf"
      )
      plt.savefig(loc, bbox_inches="tight")
      plt.close()

      print("INFO: figure saved at {}".format(loc))

      # The elapsed time is measured from the module-level `start` when the
      # entry point defined it (historical behaviour), otherwise from the
      # beginning of this call.
      end = time.time()
      print("program ended in {0:.2f} seconds".format(end - globals().get("start", began)))
      return {
          "f1": np.mean(fscores),
          "entropy": entropy,
          "homogeneity": homogeneity,
          "completeness": completeness,
          "v-measure": v_measure,
          "purity_class_score": purity_class_score,
          # NOTE(review): this key contains a space, unlike "purity_class_score";
          # kept as is because it ends up in the result files.
          "purity_cluster score": purity_cluster_score,
          "K": K
      }
15b183a24   Mathias   Add purity measur...
143

4ed3ebc7d   Mathias   Save results on a...
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
  if __name__ == "__main__":
      # Command line entry point: trains `--nmodels` clusterings on the same
      # data, one sub-directory per model, then writes the mean and standard
      # deviation of every measure in a text file and all the values in a CSV file.
      #
      # `args`, `start`, `le` and `labels` are deliberately kept as module-level
      # names: code outside this block may read them as globals.

      # Argparse
      parser = argparse.ArgumentParser("Compute clustering on a latent space")
      parser.add_argument("features")
      parser.add_argument("utt2",
                          type=str,
                          help="file with [utt] [value]")
      parser.add_argument("--idsfrom",
                          type=str,
                          default="utt2",
                          choices=[
                              "features",
                              "utt2"
                          ],
                          help="from features or from utt2?")
      parser.add_argument("--prefix",
                          default="",
                          type=str,
                          help="prefix of saved files")
      parser.add_argument("--outdir",
                          default=None,
                          type=str,
                          help="Output directory")
      parser.add_argument("--nmodels",
                          type=int,
                          default=1,
                          help="specifies the number of models to train")
      args = parser.parse_args()

      # Explicit check rather than `assert`, which is removed under `python -O`.
      if not args.outdir:
          parser.error("--outdir is required")

      start = time.time()

      # Load features and utt2
      features = read_features(args.features)
      utt2 = read_lst(args.utt2)

      # Take id list
      if args.idsfrom == "features":
          ids = list(features.keys())
      elif args.idsfrom == "utt2":
          ids = list(utt2.keys())
      else:
          # Defensive only: argparse `choices` already rejects other values.
          print(f"idsfrom is not good: {args.idsfrom}")
          raise SystemExit(1)

      feats = np.vstack([features[id_] for id_ in ids])
      classes = [utt2[id_] for id_ in ids]

      # Encode labels
      le = LabelEncoder()
      labels = le.fit_transform(classes)

      # measure name => list of the values obtained by each model
      measures = {}
      for i in range(1, args.nmodels + 1):
          subdir = os.path.join(args.outdir, str(i))
          # Also creates `--outdir` itself when it does not exist yet.
          os.makedirs(subdir, exist_ok=True)
          print(f"[{i}/{args.nmodels}] => {subdir}")
          results = train_clustering(le, feats, classes, subdir)

          for key, value in results.items():
              measures.setdefault(key, []).append(value)

      # Statistics are computed once and shared by the two result files.
      stats = {}
      for key, collected in measures.items():
          values = np.asarray(collected, dtype=float)
          stats[key] = (values, np.mean(values), np.std(values))

      # File with results
      file_results = os.path.join(args.outdir, "clustering_measures.txt")

      with open(file_results, "w") as f:
          f.write(f"[nmodels: {args.nmodels}]\n")
          for key, (_, mean, std) in stats.items():
              f.write(f"[{key} => mean: {mean}, std: {std}] \n")

      # CSV File with all the values
      file_csv_measures = os.path.join(args.outdir, "clustering_measures.csv")

      with open(file_csv_measures, "w", newline="") as f:
          writer = csv.writer(f, delimiter=",")
          writer.writerow(["measure"] + list(range(1, args.nmodels + 1)) + ["mean"] + ["std"])
          for key, (values, mean, std) in stats.items():
              writer.writerow([key] + list(values) + [mean] + [std])