Commit 4ed3ebc7d7a764a8ea50a8a9c58d4edb83799c99

Authored by Mathias
1 parent 1f8612ebfd
Exists in master

Save results to a CSV file

Showing 1 changed file with 131 additions and 56 deletions Side-by-side Diff

scripts/evaluations/clustering.py
... ... @@ -8,66 +8,28 @@
8 8 import os
9 9 import time
10 10 import pickle
  11 +import csv
  12 +
11 13 from sklearn.preprocessing import LabelEncoder
12 14 from sklearn.metrics.pairwise import pairwise_distances
13   -from sklearn.metrics import f1_score
14 15 from sklearn.cluster import KMeans
15 16 from sklearn.manifold import TSNE
  17 +from sklearn.metrics import f1_score, homogeneity_score, completeness_score, v_measure_score
16 18 import matplotlib.pyplot as plt
17 19  
18 20 from volia.data_io import read_features,read_lst
  21 +from volia.measures import entropy_score
19 22  
20   -if __name__ == "__main__":
21   - # Argparse
22   - parser = argparse.ArgumentParser("Compute clustering on a latent space")
23   - parser.add_argument("features")
24   - parser.add_argument("utt2",
25   - type=str,
26   - help="file with [utt] [value]")
27   - parser.add_argument("--idsfrom",
28   - type=str,
29   - default="utt2",
30   - choices=[
31   - "features",
32   - "utt2"
33   - ],
34   - help="from features or from utt2?")
35   - parser.add_argument("--prefix",
36   - default="",
37   - type=str,
38   - help="prefix of saved files")
39   - parser.add_argument("--outdir",
40   - default=None,
41   - type=str,
42   - help="Output directory")
43   -
44   - args = parser.parse_args()
  23 +'''
  24 +TODO:
  25 +- Add an option allowing the user to choose the number of
  26 +clusterings to train in order to compute the average and the standard deviation of the measures.
  27 +'''
45 28  
46   - assert args.outdir
47 29  
48   - start = time.time()
  30 +def train_clustering(label_encoder, feats, classes, outdir):
  31 + num_classes = len(label_encoder.classes_)
49 32  
50   - # Load features and utt2
51   - features = read_features(args.features)
52   - utt2 = read_lst(args.utt2)
53   -
54   - # Take id list
55   - if args.idsfrom == "features":
56   - ids = list(features.keys())
57   - elif args.idsfrom == "utt2":
58   - ids = list(utt2.keys())
59   - else:
60   - print(f"idsfrom is not good: {args.idsfrom}")
61   - exit(1)
62   -
63   - feats = np.vstack([ features[id_] for id_ in ids ])
64   - classes = [ utt2[id_] for id_ in ids ]
65   -
66   - # Encode labels
67   - le = LabelEncoder()
68   - labels = le.fit_transform(classes)
69   - num_classes = len(le.classes_)
70   -
71 33 # Compute KMEANS clustering on data
72 34 estimator = KMeans(
73 35 n_clusters=num_classes,
... ... @@ -78,7 +40,7 @@
78 40 estimator.fit(feats)
79 41 print(f"Kmeans: processed {estimator.n_iter_} iterations - intertia={estimator.inertia_}")
80 42  
81   - with open(os.path.join(args.outdir, "kmeans.pkl"), "wb") as f:
  43 + with open(os.path.join(outdir, f"_kmeans.pkl"), "wb") as f:
82 44 pickle.dump(estimator, f)
83 45  
84 46 # contains distance to each cluster for each sample
85 47  
86 48  
... ... @@ -102,12 +64,28 @@
102 64 # F-measure
103 65 fscores = f1_score(labels, predicted_labels, average=None)
104 66 fscores_str = "\n".join(map(lambda i: "{0:25s}: {1:.4f}".format(le.classes_[i], fscores[i]), range(len(fscores))))
105   - print(f"F1-scores for each classes:\n{fscores_str}")
106   - print(f"Global score : {np.mean(fscores)}")
107   - with open(os.path.join(args.outdir, args.prefix + "eval_clustering.log"), "w") as fd:
  67 +
  68 + # Entropy
  69 + _, _, entropy = entropy_score(labels, predicted_labels)
  70 +
  71 + # Homogeneity
  72 + homogeneity = homogeneity_score(labels, predicted_labels)
  73 +
  74 + # Completeness
  75 + completeness = completeness_score(labels, predicted_labels)
  76 +
  77 + # V-Measure
  78 + v_measure = v_measure_score(labels, predicted_labels)
  79 +
  80 + # Write results
  81 + with open(os.path.join(outdir, f"_" + args.prefix + "eval_clustering.log"), "w") as fd:
108 82 print(f"F1-scores for each classes:\n{fscores_str}", file=fd)
  83 + print(f"Entropy: {entropy}", file=fd)
109 84 print(f"Global score : {np.mean(fscores)}", file=fd)
110   -
  85 + print(f"Homogeneity: {homogeneity}", file=fd)
  86 + print(f"completeness: {completeness}", file=fd)
  87 + print(f"v-measure: {v_measure}", file=fd)
  88 +
111 89 # Process t-SNE and plot
112 90 tsne_estimator = TSNE()
113 91 embeddings = tsne_estimator.fit_transform(feats)
... ... @@ -135,7 +113,7 @@
135 113 plt.suptitle("Kmeans Clustering")
136 114  
137 115 loc = os.path.join(
138   - args.outdir,
  116 + outdir,
139 117 args.prefix + "kmeans.pdf"
140 118 )
141 119 plt.savefig(loc, bbox_inches="tight")
... ... @@ -145,4 +123,101 @@
145 123  
146 124 end = time.time()
147 125 print("program ended in {0:.2f} seconds".format(end-start))
  126 + return {
  127 + "f1": np.mean(fscores),
  128 + "entropy": entropy,
  129 + "homogeneity": homogeneity,
  130 + "completeness": completeness,
  131 + "v-measure": v_measure
  132 + }
  133 +
  134 +if __name__ == "__main__":
  135 + # Argparse
  136 + parser = argparse.ArgumentParser("Compute clustering on a latent space")
  137 + parser.add_argument("features")
  138 + parser.add_argument("utt2",
  139 + type=str,
  140 + help="file with [utt] [value]")
  141 + parser.add_argument("--idsfrom",
  142 + type=str,
  143 + default="utt2",
  144 + choices=[
  145 + "features",
  146 + "utt2"
  147 + ],
  148 + help="from features or from utt2?")
  149 + parser.add_argument("--prefix",
  150 + default="",
  151 + type=str,
  152 + help="prefix of saved files")
  153 + parser.add_argument("--outdir",
  154 + default=None,
  155 + type=str,
  156 + help="Output directory")
  157 + parser.add_argument("--nmodels",
  158 + type=int,
  159 + default=1,
  160 + help="specifies the number of models to train")
  161 + args = parser.parse_args()
  162 +
  163 + assert args.outdir
  164 +
  165 + start = time.time()
  166 +
  167 + # Load features and utt2
  168 + features = read_features(args.features)
  169 + utt2 = read_lst(args.utt2)
  170 +
  171 + # Take id list
  172 + if args.idsfrom == "features":
  173 + ids = list(features.keys())
  174 + elif args.idsfrom == "utt2":
  175 + ids = list(utt2.keys())
  176 + else:
  177 + print(f"idsfrom is not good: {args.idsfrom}")
  178 + exit(1)
  179 +
  180 + feats = np.vstack([ features[id_] for id_ in ids ])
  181 + classes = [ utt2[id_] for id_ in ids ]
  182 +
  183 + # Encode labels
  184 + le = LabelEncoder()
  185 + labels = le.fit_transform(classes)
  186 +
  187 + measures = {}
  188 + for i in range(1, args.nmodels+1):
  189 + subdir = os.path.join(args.outdir, str(i))
  190 + if not os.path.exists(subdir):
  191 + os.mkdir(subdir)
  192 + print(f"[{i}/{args.nmodels}] => {subdir}")
  193 + results = train_clustering(le, feats, classes, subdir)
  194 +
  195 + for key, value in results.items():
  196 + if key not in measures:
  197 + measures[key] = []
  198 + measures[key].append(results[key])
  199 +
  200 +
  201 + # File with results
  202 + file_results = os.path.join(args.outdir, "clustering_measures.txt")
  203 +
  204 + with open(file_results, "w") as f:
  205 + f.write(f"[nmodels: {args.nmodels}]\n")
  206 + for key in measures.keys():
  207 + values = np.asarray(measures[key], dtype=float)
  208 + mean = np.mean(values)
  209 + std = np.std(values)
  210 + f.write(f"[{key} => mean: {mean}, std: {std}] \n")
  211 +
  212 + # CSV File with all the values
  213 + file_csv_measures = os.path.join(args.outdir, "clustering_measures.csv")
  214 +
  215 + with open(file_csv_measures, "w", newline="") as f:
  216 + writer = csv.writer(f, delimiter=",")
  217 + writer.writerow(["measure"] + list(range(1, args.nmodels+1)) + ["mean"] + ["std"])
  218 + for key in measures.keys():
  219 + values = np.asarray(measures[key], dtype=float)
  220 + mean = np.mean(values)
  221 + std = np.std(values)
  222 + writer.writerow([key] + list(values) + [mean] + [std])