From e403ed5fb6202dae56d47815d5961cced00f1c85 Mon Sep 17 00:00:00 2001
From: Mathias <stillinbedstream@gmail.com>
Date: Mon, 14 Sep 2020 14:40:53 +0200
Subject: [PATCH] Add a script that allows users to evaluate a representation
 using classification labels.

---
 scripts/evaluations/clustering.py | 126 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 126 insertions(+)
 create mode 100644 scripts/evaluations/clustering.py

diff --git a/scripts/evaluations/clustering.py b/scripts/evaluations/clustering.py
new file mode 100644
index 0000000..60f6286
--- /dev/null
+++ b/scripts/evaluations/clustering.py
@@ -0,0 +1,126 @@
+'''
+This script allows the user to evaluate a classification system on new labels using clustering methods.
+KMeans and t-SNE are applied on the given latent space (embedding); clusters are named after their most represented label and scored with per-class F1.
+'''
+import argparse
+import numpy as np
+import pandas as pd
+import os
+import time
+from sklearn.preprocessing import LabelEncoder
+from sklearn.metrics.pairwise import pairwise_distances
+from sklearn.metrics import f1_score
+from sklearn.cluster import KMeans
+from sklearn.manifold import TSNE
+import matplotlib.pyplot as plt
+
+from volia.data_io import read_features,read_lst
+
+if __name__ == "__main__":
+    # Command line: a features file, an "[utt] [value]" list, and where/how to name the outputs.
+    parser = argparse.ArgumentParser(description="Compute clustering on a latent space")
+    parser.add_argument("features")
+    parser.add_argument("utt2",
+                        type=str,
+                        help="file with [utt] [value]")
+    parser.add_argument("--prefix",
+                        type=str, default="",  # empty default so prefix + filename never hits None
+                        help="prefix of saved files")
+    parser.add_argument("--outdir",
+                        default=None,
+                        type=str,
+                        help="Output directory")
+
+    args = parser.parse_args()
+    if not args.outdir:  # explicit check; a bare assert is stripped when run with "python -O"
+        parser.error("--outdir is required")
+    os.makedirs(args.outdir, exist_ok=True)  # fail early, before the long KMeans run
+
+    start = time.time()
+
+    # Load features and utt2
+    features = read_features(args.features)
+    utt2 = read_lst(args.utt2)
+
+    ids = list(features.keys())
+    feats = np.vstack([features[id_] for id_ in ids])  # rows stacked in the same order as ids
+    classes = [utt2[id_] for id_ in ids]  # label looked up per feature id
+
+    # Encode labels
+    le = LabelEncoder()
+    labels = le.fit_transform(classes)
+    num_classes = len(le.classes_)
+
+    # Compute KMEANS clustering on data, with one cluster per distinct label
+    estimator = KMeans(
+        n_clusters=num_classes,
+        n_init=100,
+        tol=1e-6,  # 1e-6, not "10-6" (which evaluates to the integer 4)
+        algorithm="elkan"
+    )
+    estimator.fit(feats)
+    print(f"Kmeans: processed {estimator.n_iter_} iterations - intertia={estimator.inertia_}")
+
+    # contains distance to each cluster for each sample
+    dist_space = estimator.transform(feats)
+    predictions = np.argmin(dist_space, axis=1)
+
+    # gives each cluster a name (considering most represented label)
+    dataframe = pd.DataFrame({
+        "label": le.classes_[labels],
+        "prediction": pd.Series(predictions)
+    })
+
+    def find_cluster_name_fn(c):
+        """Return the most represented true label among the samples assigned to cluster c."""
+        mask = dataframe["prediction"] == c
+        return dataframe[mask]["label"].value_counts(sort=False).idxmax()
+
+    cluster_names = list(map(find_cluster_name_fn, range(num_classes)))
+    predicted_labels = le.transform(
+        [cluster_names[pred] for pred in predictions])
+
+    # F-measure: one score per class, plus their unweighted mean as the global score
+    fscores = f1_score(labels, predicted_labels, average=None)
+    fscores_str = "\n".join(
+        "{0:25s}: {1:.4f}".format(name, score) for name, score in zip(le.classes_, fscores))
+    report = f"F1-scores for each classes:\n{fscores_str}\nGlobal score : {np.mean(fscores)}"
+    print(report)
+    with open(os.path.join(args.outdir, args.prefix + "eval_clustering.log"), "w") as fd:
+        print(report, file=fd)
+
+    # Process t-SNE and plot
+    tsne_estimator = TSNE()
+    embeddings = tsne_estimator.fit_transform(feats)
+    print("t-SNE: processed {0} iterations - KL_divergence={1:.4f}".format(
+        tsne_estimator.n_iter_, tsne_estimator.kl_divergence_))
+
+    # Left panel: points coloured by true label; right panel: by the name given to their cluster.
+    fig, [axe1, axe2] = plt.subplots(1, 2, figsize=(10, 5))
+    for c, name in enumerate(le.classes_):
+        true_points = embeddings[labels == c]
+        axe1.scatter(true_points[:, 0], true_points[:, 1], label=name, alpha=0.2, edgecolors=None)
+
+        # All clusters sharing this name are drawn together (cluster_names may contain duplicates).
+        pred_mask = predicted_labels == c
+        if not pred_mask.any():
+            print("WARNING: no cluster found for {}".format(name))
+            continue
+        pred_points = embeddings[pred_mask]
+        axe2.scatter(pred_points[:, 0], pred_points[:, 1], label=name, alpha=0.2, edgecolors=None)
+
+    axe1.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
+    axe1.set_title("true labels")
+    axe2.legend(loc="lower center", bbox_to_anchor=(0.5, -0.35))
+    axe2.set_title("predicted cluster label")
+
+    plt.suptitle("Kmeans Clustering")
+
+    loc = os.path.join(args.outdir, args.prefix + "kmeans.pdf")
+    plt.savefig(loc, bbox_inches="tight")
+    plt.close()
+
+    print("INFO: figure saved at {}".format(loc))
+
+    end = time.time()
+    print("program ended in {0:.2f} seconds".format(end-start))
\ No newline at end of file
-- 
1.8.2.3