From 9bb5ff657bf803e1ce5a403f9998e700bf3a3f72 Mon Sep 17 00:00:00 2001
From: quillotm
Date: Wed, 30 Jun 2021 10:19:00 +0200
Subject: [PATCH] Adding n argument to pred_distribution_wt_sel

Adding some comments
---
 volia/stats.py | 91 +++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 71 insertions(+), 20 deletions(-)

diff --git a/volia/stats.py b/volia/stats.py
index c22a75e..d46cd11 100644
--- a/volia/stats.py
+++ b/volia/stats.py
@@ -10,12 +10,18 @@ import pickle
 import matplotlib.pyplot as plt
 import matplotlib.colors as mcolors
 from utils import SubCommandRunner
-
-
 from cycler import cycler
+
 
 
 def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir: str):
+    '''
+    Distribution of the predictions.
+    For each label, we plot the distribution of the predicted classes.
+    For example, for each character, we plot the distribution of the predicted characters.
+    Another example: for each speaker, we plot the distribution of the predicted characters.
+
+    '''
 
     predictions = core.data.read_id_values(args.predictions, float)
     labels = core.data.read_labels(args.labels)
@@ -35,10 +41,6 @@ def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir:
         else:
             stats[label]["nb_utt"] = stats[label]["nb_utt"] + 1
             stats[label]["predictions"] = np.append(stats[label]["predictions"], np.expand_dims(predictions_, axis=0), axis=0)
-
-
-    print("CALCULATING ---------------------------")
-
 
     colors = [
         "darkorange",
@@ -48,13 +50,14 @@ def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir:
     custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
                      cycler(linestyle=['-', '--', '-.']))
 
+    print("CALCULATING ---------------------------")
     for label, stats_ in stats.items():
 
         plt.gca().set_prop_cycle(custom_cycler)
 
         stats_mean = np.mean(stats_["predictions"], axis=0)
         stats_std = np.std(stats_["predictions"], axis=0)
-
+
         #print(label)
         #print(stats_mean)
         #print(stats_std)
@@ -66,7 +69,6 @@ def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir:
             mu = stats_mean[i]
             variance = stats_std[i] * stats_std[i]
             sigma = stats_std[i]
-            # math.sqrt(variance)
 
             print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")
             #x_values = np.arange(-1, 5, 0.1)
@@ -75,7 +77,7 @@ def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir:
             #y_values = scipy.stats.norm(stats_mean[i], stats_std[i])
             #y = scipy.stats.norm.pdf(x,mean,std)
             #plt.plot(x_values, y_values.pdf(x_values,))
-
+
             #x, step = np.linspace(mu - 3*sigma, mu + 3*sigma, 1000, retstep=True)
             x = np.linspace(0, 1, 1000)
             #x = np.linspace(mu - 3*sigma, mu + 3*sigma, 1000)
@@ -93,21 +95,68 @@ def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir:
     print("Decisions")
 
 
-def pred_distribution_wt_sel(predictions: str, labels: str, labelencoder: str, outdir: str):
+def pred_distribution_wt_sel(predictions: str, n: int, labels: str, labelencoder: str, outdir: str):
+
+    '''
+    Distribution of the predictions, with a selection step.
+
+    1) For each dimension, select the n individuals with the highest values on that dimension.
+       We call S_i the set of the n individuals selected for dimension i.
+    2) For each subset S_i, we plot the distribution of every dimension.
+    '''
+
+    le = None
+    with open(args.labelencoder, "rb") as f:
+        le = pickle.load(f)
     keys_preds, matrix_preds = core.data.read_features_with_matrix(predictions)
 
-    n = 3
-    print(matrix_preds.shape)
+
+    colors = [
+        "darkorange",
+        "red",
+        "blue"
+    ]
+    custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
+                     cycler(linestyle=['-', '--', '-.']))
+
+    kwargs = dict(alpha=0.5)
+
+    stats_of = open(os.path.join(args.outdir, f"stats.txt"), "w")
     for j in range(matrix_preds.shape[1]):
+
+        label_focused = le.inverse_transform([j])[0]
         indices = (-matrix_preds[:, j]).argsort()[:n]
-        print(f"INDICE: {j}")
-        print("indices")
-        print(indices)
-        print("Best values")
-        print(matrix_preds[indices, j])
-        print("All dimensions of best values")
-        print(matrix_preds[indices])
-        # Select the n best for each column
+
+        print(f"LABEL: {label_focused}", file=stats_of)
+        print(f"INDICE: {j}", file=stats_of)
+        print("indices", file=stats_of)
+        print(indices, file=stats_of)
+        print("Best values", file=stats_of)
+        print(matrix_preds[indices, j], file=stats_of)
+        print("All dimensions of best values", file=stats_of)
+        print(matrix_preds[indices], file=stats_of)
+
+        # Use the selected rows to build a plot.
+        pred_ = matrix_preds[indices]
+        stats_mean = np.mean(pred_, axis=0)
+        stats_std = np.std(pred_, axis=0)
+        for i in range(matrix_preds.shape[1]):
+            label_str = le.inverse_transform([i])[0]
+            mu = stats_mean[i]
+            variance = stats_std[i] * stats_std[i]
+            sigma = stats_std[i]
+
+            print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")
+
+            x = np.linspace(0, 1, 1000)
+
+            P = scipy.stats.norm.cdf(x, mu, sigma)
+            plt.plot(x, P, label=label_str, **kwargs)
+
+        plt.legend()
+        plt.savefig(os.path.join(args.outdir, f"{label_focused}_prediction_cdf.pdf"))
+        plt.clf()
+    stats_of.close()
     pass
 
 
@@ -149,10 +198,12 @@ if __name__ == "__main__":
     # pred-distribution-with-selection
     parser_pred_dist_wt_sel = subparsers.add_parser("pred-distribution-with-selection", help="plot distributions of prediction through labels with a selection of the n best records by column/class prediction.")
     parser_pred_dist_wt_sel.add_argument("--predictions", type=str, help="prediction file", required=True)
+    parser_pred_dist_wt_sel.add_argument("-n", type=int, help="number of top-scoring records selected for each prediction dimension y_i.")
     parser_pred_dist_wt_sel.add_argument("--labels", type=str, help="label file", required=True)
     parser_pred_dist_wt_sel.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
     parser_pred_dist_wt_sel.add_argument("--outdir", type=str, help="output file", required=True)
     parser_pred_dist_wt_sel.set_defaults(which="pred_distribution_with_selection")
+
     # duration-stats
     parser_utt2dur = subparsers.add_parser("utt2dur", help="distribution of utt2dur")
     parser_utt2dur.add_argument("--utt2dur", type=str, help="utt2dur file", required=True)
-- 
1.8.2.3
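
Note on the selection logic added above: for each prediction dimension j, the patched pred_distribution_wt_sel keeps the n rows with the highest scores on that dimension, then plots the Gaussian CDF of every dimension estimated on that subset. The sketch below is a minimal, self-contained approximation of that flow, not the patched function itself: the prediction matrix and class names are synthetic stand-ins for what core.data.read_features_with_matrix and the pickled label encoder provide, and only numpy/scipy/matplotlib calls already used in the patch are relied on.

# Standalone sketch of the top-n selection plus per-dimension CDF plot.
# The class names and the prediction matrix are made up for illustration;
# the real script reads them from files via core.data and a label encoder.
import numpy as np
import scipy.stats
import matplotlib
matplotlib.use("Agg")  # render to files only, no display required
import matplotlib.pyplot as plt

n = 3                                   # number of rows kept per dimension
class_names = ["liu", "bei", "guan"]    # hypothetical label names
rng = np.random.default_rng(0)
matrix_preds = rng.random((20, len(class_names)))   # fake prediction matrix

x = np.linspace(0, 1, 1000)
for j, focused in enumerate(class_names):
    # argsort on the negated column gives the indices of the n largest scores
    indices = (-matrix_preds[:, j]).argsort()[:n]
    selection = matrix_preds[indices]

    # per-dimension statistics of the selected subset S_j
    mu = selection.mean(axis=0)
    sigma = selection.std(axis=0)

    for i, label in enumerate(class_names):
        # a degenerate sigma would collapse the CDF to a step; guarded here for the sketch
        cdf = scipy.stats.norm.cdf(x, mu[i], max(sigma[i], 1e-6))
        plt.plot(x, cdf, label=label, alpha=0.5)

    plt.legend()
    plt.savefig(f"{focused}_prediction_cdf.pdf")
    plt.clf()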
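On the command-line side, the new -n flag is wired as an optional argument: it has type=int but neither required=True nor a default, so args.n is None when the flag is omitted. Assuming the script is run directly with Python (the dispatch through SubCommandRunner is outside this hunk), an invocation would look like: python volia/stats.py pred-distribution-with-selection --predictions preds.txt -n 3 --labels labels.txt --labelencoder le.pkl --outdir out/ (the file names here are placeholders).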