# stats.py 5.96 KB
import argparse

import os
import core.data
import math
import numpy as np
import scipy.stats
import pickle
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from utils import SubCommandRunner


from cycler import cycler

def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir: str) -> None:
    """Plot, per true label, a fitted normal CDF for every prediction column.

    Prediction vectors are grouped by the first label of their utterance.
    For each group the per-column mean and standard deviation are computed,
    and the CDF of the normal distribution with those parameters is drawn
    over [0, 1], one curve per column. One figure per label is saved as
    ``{label}_prediction_cdf.pdf`` inside ``outdir``.

    Args:
        predictions: Path to the prediction file, read with
            ``core.data.read_id_values(..., float)``.
        labels: Path to the label file, read with ``core.data.read_labels``.
        labelencoder: Path to a pickled object exposing
            ``inverse_transform``; it maps a column index to the name used
            in the legend (presumably a scikit-learn LabelEncoder - confirm).
        outdir: Directory receiving the PDF figures; created if missing.

    Raises:
        KeyError: If an utterance id of the prediction file has no entry in
            the label file.
    """
    # Read from the function arguments, not from the module-level ``args``,
    # so the function also works when imported and called directly.
    predictions = core.data.read_id_values(predictions, float)
    labels = core.data.read_labels(labels)

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(labelencoder, "rb") as f:
        le = pickle.load(f)

    print("PREDICTIONS ---------------------------")
    # Collect the vectors per label in plain lists and stack them once later;
    # growing an array with np.append on every utterance is quadratic.
    grouped = {}
    for id_, predictions_ in predictions.items():
        grouped.setdefault(labels[id_][0], []).append(predictions_)

    print("CALCULATING ---------------------------")

    custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
                     cycler(linestyle=['-', '--', '-.']))
    plot_kwargs = dict(alpha=0.5)
    # Every curve is evaluated on the same grid, so build it only once.
    x = np.linspace(0, 1, 1000)

    if outdir:
        os.makedirs(outdir, exist_ok=True)

    for label, rows in grouped.items():
        # One row per utterance, one column per prediction dimension.
        label_predictions = np.stack(rows)

        # clf() at the end of each iteration resets the axes, so the cycler
        # has to be installed again for every figure.
        plt.gca().set_prop_cycle(custom_cycler)
        stats_mean = np.mean(label_predictions, axis=0)
        stats_std = np.std(label_predictions, axis=0)

        for i in range(label_predictions.shape[1]):
            label_str = le.inverse_transform([i])[0]
            mu = stats_mean[i]
            sigma = stats_std[i]
            variance = sigma * sigma
            print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")

            P = scipy.stats.norm.cdf(x, mu, sigma)
            plt.plot(x, P, label=label_str, **plot_kwargs)

        plt.legend()
        plt.savefig(os.path.join(outdir, f"{label}_prediction_cdf.pdf"))
        plt.clf()

    print("Decisions")


def pred_distribution_wt_sel(predictions: str, labels: str, labelencoder: str, outdir: str):
    """Print, for every prediction column, the rows with its highest values.

    The prediction matrix is loaded with
    ``core.data.read_features_with_matrix``. For each column, the indices of
    the three largest entries are printed, followed by those entries and by
    the complete rows they belong to.

    Args:
        predictions: Path to the prediction file.
        labels: Unused by the current implementation.
        labelencoder: Unused by the current implementation.
        outdir: Unused by the current implementation.
    """
    top_n = 3
    _, scores = core.data.read_features_with_matrix(predictions)
    print(scores.shape)

    # Select the n best for each column
    for column in range(scores.shape[1]):
        # Sorting the negated column ascending ranks the largest values first.
        negated = -scores[:, column]
        best_rows = negated.argsort()[:top_n]

        print(f"INDICE: {column}")
        print("indices", best_rows, sep="\n")
        print("Best values", scores[best_rows, column], sep="\n")
        print("All dimensions of best values", scores[best_rows], sep="\n")


def utt2dur(utt2dur: str, labels: str) -> None:
    """Print the count, mean and standard deviation of utterance durations.

    Every non-blank line of the file is expected to hold an utterance id
    followed by its duration, separated by whitespace. Fields after the
    duration are ignored.

    Args:
        utt2dur: Path to the utt2dur file.
        labels: Path to a labels file, or None. Currently unused.

    Raises:
        ValueError: If a line has no duration field, or the duration is not
            a valid float.
    """
    durations = []
    with open(utt2dur, "r") as f:
        for line_no, line in enumerate(f, start=1):
            # split() without argument tolerates tabs, repeated spaces and
            # the trailing newline; a blank line yields no field at all.
            fields = line.split()
            if not fields:
                continue
            if len(fields) < 2:
                raise ValueError(
                    f"{utt2dur}:{line_no}: missing duration field: {line!r}"
                )
            durations.append(float(fields[1]))

    durations = np.asarray(durations, dtype=float)
    print(durations.shape)
    if durations.size:
        mean = np.mean(durations)
        std = np.std(durations)
    else:
        # np.mean / np.std of an empty array warn and return NaN; report NaN
        # directly so an empty file produces clean output.
        mean = std = float("nan")

    print(f"mean: {mean}")
    print(f"std: {std}")


if __name__ == "__main__":
    # Command-line entry point: one sub-command per statistics function.
    # Each sub-parser stores the name of its handler in ``which``; every
    # other parsed option is forwarded to the handler by the runner.

    # Parser
    parser = argparse.ArgumentParser(description="Statistics")
    subparsers = parser.add_subparsers(title="actions")

    # pred-distribution
    parser_pred_dist = subparsers.add_parser("pred-distribution", help="plot distributions of prediction through labels")
    parser_pred_dist.add_argument("--predictions", type=str, help="prediction file", required=True)
    parser_pred_dist.add_argument("--labels", type=str, help="label file", required=True)
    parser_pred_dist.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
    parser_pred_dist.add_argument("--outdir", type=str, help="output directory", required=True)
    parser_pred_dist.set_defaults(which="pred_distribution")

    # pred-distribution-with-selection
    parser_pred_dist_wt_sel = subparsers.add_parser("pred-distribution-with-selection", help="plot distributions of prediction through labels with a selection of the n best records by column/class prediction.")
    parser_pred_dist_wt_sel.add_argument("--predictions", type=str, help="prediction file", required=True)
    parser_pred_dist_wt_sel.add_argument("--labels", type=str, help="label file", required=True)
    parser_pred_dist_wt_sel.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
    parser_pred_dist_wt_sel.add_argument("--outdir", type=str, help="output directory", required=True)
    parser_pred_dist_wt_sel.set_defaults(which="pred_distribution_with_selection")

    # duration-stats
    parser_utt2dur = subparsers.add_parser("utt2dur", help="distribution of utt2dur")
    parser_utt2dur.add_argument("--utt2dur", type=str, help="utt2dur file", required=True)
    parser_utt2dur.add_argument("--labels", type=str, default=None, help="labels file")
    parser_utt2dur.set_defaults(which="utt2dur")

    # Parse
    args = parser.parse_args()

    # Sub-commands are optional in argparse: when none is given, no
    # sub-parser default is applied and ``which`` is missing. Report a usage
    # error instead of failing with an AttributeError further down.
    if getattr(args, "which", None) is None:
        parser.error("no action given (use -h to list the available actions)")

    # Run commands
    runner = SubCommandRunner({
        "pred_distribution": pred_distribution,
        "pred_distribution_with_selection": pred_distribution_wt_sel,
        "utt2dur": utt2dur,
    })

    runner.run(args.which, vars(args), remove="which")