Quillot Mathias / volia

Browse Code »

Commit 765b51bc7741c15001b3983ef6df4d68eedbcd62

Authored by Quillot Mathias 2021-06-24 10:49:54 +0200

1 parent d27fe6fcc5

Exists in master

Little modification to synchronize

Showing 2 changed files with 51 additions and 2 deletions Inline Diff

volia/core/data.py
volia/stats.py

volia/core/data.py

Diff comments View file @ 765b51b

 '''
 Data management input/output
 '''
 # Import packages and modules
 import numpy as np
 import sys
 # Defining some types
-from typing import List, Dict
+from typing import List, Dict, Tuple
+from numpy.lib.shape_base import expand_dims
 # Type aliases for the id-keyed dictionaries handled in this module.
 # NOTE(review): read_id_values stores np.ndarray values, so the List[...]
 # value types declared here are approximate.
 # id -> list of strings
 KeyToList = Dict[str, List[str]]
 # id -> string labels
 KeyToLabels = Dict[str, List[str]]
 # id -> integer labels
 KeyToIntLabels = Dict[str, List[int]]
 # id -> float feature values
 KeyToFeatures = Dict[str, List[float]]
 def read_lst(file_path: str) -> List[str]:
     '''
     Read a lst file with this structure:
     [id_1]
     [id_2]
     ...
     [id_n]
     Every line is kept as one id, with its newline character removed.
     Args:
         file_path (str): path of the lst file.
     Returns:
         List[str]: the ids, in file order.
     '''
     # Annotated as List[str]: a plain list is built here, not a dictionary.
     with open(file_path, "r") as f:
         return [line.replace("\n", "") for line in f]
 def read_id_values(file_path: str, value_type=str):
     '''
     Read file where each line is an id with its corresponding values:
     [id_1] [value_1_1] [value_1_2] ... [value_1_k]
     [id_2] [value_2_1] [value_2_2] ... [value_2_k]
     ...
     [id_n] [value_n_1] [value_n_2] ... [value_n_k]
     where values are value_type type.
     Fields may be separated by any run of whitespace; blank lines are skipped.
     Used in many reader functions with specific value_type.
     Args:
         file_path (str): path of the file to read.
         value_type: dtype given to np.asarray for the values. Defaults to str.
     Returns:
         dict: id as key and a numpy array of the values as associated value.
     Raises:
         ValueError: if a value cannot be converted to value_type.
     '''
     id_values = {}
     with open(file_path, "r") as f:
         for line in f:
             # split() without argument splits on whitespace runs, so repeated
             # or trailing blanks never produce empty-string values.
             fields = line.split()
             if not fields:
                 # Blank line: there is no id to register.
                 continue
             id_values[fields[0]] = np.asarray(fields[1:], dtype=value_type)
     return id_values
 def read_features(file_path: str) -> KeyToFeatures:
     '''
     Load a features file laid out as:
     [id_1] [value_1_1] [value_1_2] ... [value_1_k]
     [id_2] [value_2_1] [value_2_2] ... [value_2_k]
     ...
     [id_n] [value_n_1] [value_n_2] ... [value_n_k]
     Values are parsed as np.float64 by read_id_values.
     Returns a dictionary with id as key and its values as associated value.
     '''
     features = read_id_values(file_path, value_type=np.float64)
     return features
+def read_features_with_matrix(file_path: str) -> Tuple[List[str], np.ndarray]:
+    """Read a features file and returns the keys (utterances ids)
+    with the corresponding matrix of values.
+    Args:
+        file_path (str): path of the features file
+    Returns:
+        [Tuple(List[str], np.ndarray)]: a tuple with a list of keys and the matrix
+    """
+    data = read_id_values(file_path, np.float64)
+    keys = []
+    matrix = None
+    for key, values in data.items():
+        keys.append(key)
+        if matrix is None:
+            matrix = np.expand_dims(values, axis=0)
+        matrix = np.append(matrix, np.expand_dims(values, axis=0), axis=0)
+    return (keys, matrix)
 def read_labels(file_path: str) -> KeyToLabels:
     '''
     Load a labels file laid out as:
     [id_1] [value_1_1] [value_1_2] ... [value_1_k]
     [id_2] [value_2_1] [value_2_2] ... [value_2_k]
     ...
     [id_n] [value_n_1] [value_n_2] ... [value_n_k]
     Values are kept as strings (read_id_values is called with str).
     Returns a dictionary with id as key and its labels as associated value.
     '''
     labels = read_id_values(file_path, value_type=str)
     return labels
 def read_labels_integer(file_path: str) -> KeyToIntLabels:
     '''
     Load a labels file laid out as:
     [id_1] [value_1_1] [value_1_2] ... [value_1_k]
     [id_2] [value_2_1] [value_2_2] ... [value_2_k]
     ...
     [id_n] [value_n_1] [value_n_2] ... [value_n_k]
     Values are parsed as int by read_id_values.
     Returns a dictionary with id as key and its labels as associated value.
     '''
     int_labels = read_id_values(file_path, value_type=int)
     return int_labels
 def write_line(id_, values=None, out=sys.stdout):
     """
     Write a line in list, labels or features files.
     If you want to write a list, omit *values* or pass an empty
     sequence: only the id is written.
     Args:
         id_ (str): id, converted with str() before writing.
         values (sequence, optional): values to write, features or labels.
             Each value is converted with str(). Defaults to None (no values).
         out (_io.TextIOWrapper, optional): object with a write() method
             receiving the line. Defaults to sys.stdout.
     """
     # len() is used instead of a truth test so numpy arrays are accepted too.
     if values is None or len(values) == 0:
         out.write(str(id_) + "\n")
     else:
         # str() on every value lets numeric features/labels be written;
         # str.join alone only accepts strings.
         out.write(str(id_) + " " + " ".join(str(value) for value in values) + "\n")

volia/stats.py

Diff comments View file @ 765b51b

 import argparse
 import os
 import core.data
 import math
 import numpy as np
 import scipy.stats
 import pickle
 import matplotlib.pyplot as plt
 import matplotlib.colors as mcolors
 from utils import SubCommandRunner
 from cycler import cycler
 def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir: str):
     """Plot, for every reference label, the normal CDF fitted on each prediction column.
     Predictions are grouped by the first label of their id. For each group,
     the mean and standard deviation of every prediction column are printed
     and the matching normal CDF is drawn on [0, 1]. One figure per label is
     saved as "<outdir>/<label>_prediction_cdf.pdf".
     Args:
         predictions (str): path of the predictions file (id followed by float values).
         labels (str): path of the labels file (id followed by its labels).
         labelencoder (str): path of a pickled encoder; its inverse_transform
             is used to name each prediction column.
         outdir (str): directory receiving the generated figures.
     Raises:
         KeyError: if an id of the predictions file is missing from the labels file.
     """
     # The function arguments are used here, not the module-level ``args``,
     # so the function also works when it is not started from the command line.
     predictions_by_id = core.data.read_id_values(predictions, float)
     labels_by_id = core.data.read_labels(labels)
     # NOTE(review): pickle.load can execute arbitrary code; only load a
     # label encoder file coming from a trusted source.
     with open(labelencoder, "rb") as f:
         le = pickle.load(f)
     stats = {}
     print("PREDICTIONS ---------------------------")
     for id_, predictions_ in predictions_by_id.items():
         label = labels_by_id[id_][0]
         row = np.expand_dims(predictions_, axis=0)
         if label not in stats:
             stats[label] = {
                 "nb_utt": 1,
                 "predictions": row
             }
         else:
             stats[label]["nb_utt"] = stats[label]["nb_utt"] + 1
             stats[label]["predictions"] = np.append(stats[label]["predictions"], row, axis=0)
     print("CALCULATING ---------------------------")
     custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
         cycler(linestyle=['-', '--', '-.']))
     kwargs = dict(alpha=0.5)
     # The evaluation grid is identical for every curve.
     x = np.linspace(0, 1, 1000)
     for label, stats_ in stats.items():
         plt.gca().set_prop_cycle(custom_cycler)
         stats_mean = np.mean(stats_["predictions"], axis=0)
         stats_std = np.std(stats_["predictions"], axis=0)
         for i in range(stats_["predictions"].shape[1]):
             label_str = le.inverse_transform([i])[0]
             mu = stats_mean[i]
             sigma = stats_std[i]
             variance = sigma * sigma
             print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")
             P = scipy.stats.norm.cdf(x, mu, sigma)
             plt.plot(x, P, label=label_str, **kwargs)
         plt.legend()
         plt.savefig(os.path.join(outdir, f"{label}_prediction_cdf.pdf"))
         # Start from an empty figure for the next label.
         plt.clf()
     print("Decisions")
+def pred_distribution_wt_sel(predictions: str, labels: str, labelencoder: str, outdir: str):
+    keys_preds, matrix_preds = core.data.read_features_with_matrix(predictions)
+    n = 3
+    print(matrix_preds.shape)
+    for j in range(matrix_preds.shape[1]):
+        indices = (-matrix_preds[:, j]).argsort()[:n]
+        print(f"INDICE: {j}")
+        print("indices")
+        print(indices)
+        print("Best values")
+        print(matrix_preds[indices, j])
+        print("All dimensions of best values")
+        print(matrix_preds[indices])
+    # Select the n best for each column
+    pass
 def utt2dur(utt2dur: str, labels: str):
     """Print the count, mean and standard deviation of utterance durations.
     Each non-blank line of the file is expected to hold an id followed by
     its duration, separated by whitespace.
     Args:
         utt2dur (str): path of the utt2dur file.
         labels (str): path of a labels file; currently unused.
     Raises:
         IndexError: if a non-blank line has no duration field.
         ValueError: if a duration cannot be converted to float.
     """
     durations = []
     with open(utt2dur, "r") as f:
         for line in f:
             # Whitespace split tolerates repeated or trailing blanks.
             fields = line.split()
             if not fields:
                 # Blank line: nothing to read.
                 continue
             durations.append(float(fields[1]))
     durations = np.asarray(durations, dtype=float)
     print(durations.shape)
     mean = np.mean(durations)
     std = np.std(durations)
     print(f"mean: {mean}")
     print(f"std: {std}")
 if __name__ == "__main__":
     # Command-line entry point: one sub-parser is declared per action.
     # Each sub-parser records the key of its action in ``args.which`` through
     # set_defaults; that key is later handed to the SubCommandRunner below.
     # Parser
     parser = argparse.ArgumentParser(description="Statistics")
     subparsers = parser.add_subparsers(title="actions")
     # pred-distribution
     parser_pred_dist = subparsers.add_parser("pred-distribution", help="plot distributions of prediction through labels")
     parser_pred_dist.add_argument("--predictions", type=str, help="prediction file", required=True)
     parser_pred_dist.add_argument("--labels", type=str, help="label file", required=True)
     parser_pred_dist.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
     # NOTE(review): pred_distribution joins this value with a file name, so it is used as a directory.
     parser_pred_dist.add_argument("--outdir", type=str, help="output file", required=True)
     parser_pred_dist.set_defaults(which="pred_distribution")
+    # pred-distribution-with-selection
+    parser_pred_dist_wt_sel = subparsers.add_parser("pred-distribution-with-selection", help="plot distributions of prediction through labels with a selection of the n best records by column/class prediction.")
+    parser_pred_dist_wt_sel.add_argument("--predictions", type=str, help="prediction file", required=True)
+    parser_pred_dist_wt_sel.add_argument("--labels", type=str, help="label file", required=True)
+    parser_pred_dist_wt_sel.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
+    parser_pred_dist_wt_sel.add_argument("--outdir", type=str, help="output file", required=True)
+    parser_pred_dist_wt_sel.set_defaults(which="pred_distribution_with_selection")
     # duration-stats
     parser_utt2dur = subparsers.add_parser("utt2dur", help="distribution of utt2dur")
     parser_utt2dur.add_argument("--utt2dur", type=str, help="utt2dur file", required=True)
     parser_utt2dur.add_argument("--labels", type=str, default=None, help="labels file")
     parser_utt2dur.set_defaults(which="utt2dur")
     # Parse
     args = parser.parse_args()
     # Run commands
     # The dictionary keys must match the ``which`` values set above.
     # NOTE(review): SubCommandRunner comes from utils (not shown here);
     # presumably it calls the function registered under args.which with the
     # remaining parsed arguments - confirm against utils.
     runner = SubCommandRunner({
-        "pred-distribution": pred_distribution,
+        "pred_distribution": pred_distribution,
+        "pred_distribution_with_selection": pred_distribution_wt_sel,
         "utt2dur": utt2dur
     })
     runner.run(args.which, args.__dict__, remove="which")