# stats.py 7.85 KB
import argparse

import os
import core.data
import math
import numpy as np
import scipy.stats
import pickle
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from utils import SubCommandRunner
from cycler import cycler


def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir: str):
    '''
    Distribution of the prediction.

    For each label, we plot the distribution of the class predicted.
    For example, for each character, we plot the distribution of the characters predicted.
    Another example, for each speaker, we plot the distribution of the characters predicted.

    For every label, the mean and standard deviation of each prediction
    dimension are computed over the utterances carrying that label, and the
    CDF of the matching normal distribution is drawn on [0, 1]. One PDF named
    "<label>_prediction_cdf.pdf" is written per label.

    predictions: path given to core.data.read_id_values (values read as float);
        the result is iterated as id -> prediction vector.
    labels: path given to core.data.read_labels; the first entry of each
        id's labels is used as the grouping label.
    labelencoder: path of a pickled object exposing inverse_transform, used
        to turn a dimension index into a legend name.
    outdir: directory in which the PDF files are saved.
    '''
    # Read the function's own parameters rather than the script-level ``args``
    # global, so the function also works when imported or called directly.
    preds_by_id = core.data.read_id_values(predictions, float)
    labels_by_id = core.data.read_labels(labels)

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(labelencoder, "rb") as f:
        le = pickle.load(f)

    print("PREDICTIONS ---------------------------")
    # Collect rows per label and stack once later; appending to an array in
    # the loop would copy the whole matrix at every utterance.
    rows_by_label = {}
    for id_, predictions_ in preds_by_id.items():
        label = labels_by_id[id_][0]
        rows_by_label.setdefault(label, []).append(predictions_)

    custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
                     cycler(linestyle=['-', '--', '-.']))
    plot_kwargs = dict(alpha=0.5)
    x = np.linspace(0, 1, 1000)

    print("CALCULATING ---------------------------")

    for label, rows in rows_by_label.items():
        label_preds = np.stack(rows, axis=0)

        # plt.clf() below drops the axes, so the cycle is set again each time.
        plt.gca().set_prop_cycle(custom_cycler)
        stats_mean = np.mean(label_preds, axis=0)
        stats_std = np.std(label_preds, axis=0)

        for i in range(label_preds.shape[1]):
            label_str = le.inverse_transform([i])[0]
            mu = stats_mean[i]
            sigma = stats_std[i]
            variance = sigma * sigma
            print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")

            if sigma > 0:
                P = scipy.stats.norm.cdf(x, mu, sigma)
            else:
                # Zero spread (e.g. a single utterance): norm.cdf would return
                # NaN everywhere, so draw the step CDF of a point mass at mu.
                P = (x >= mu).astype(float)
            plt.plot(x, P, label=label_str, **plot_kwargs)

        plt.legend()
        plt.savefig(os.path.join(outdir, f"{label}_prediction_cdf.pdf"))
        plt.clf()

    print("Decisions")


def pred_distribution_wt_sel(predictions: str, n: int, labels: str, labelencoder: str, outdir: str):

    '''
    Distribution of the predictions with selection process.

    1) For each dimension, select the n individuals with the maximum values for the focused dimension.
    We name S_i the set of n selected individuals for the dimension i.
    2) For each subset S_i, we plot the distribution of each dimension.

    For each focused dimension, the selected indices and values are written
    to "stats.txt" in outdir, and the normal CDF (mean/std over S_i) of every
    dimension is saved as "<label>_prediction_cdf.pdf" in outdir.

    predictions: path given to core.data.read_features_with_matrix, whose
        second return value is used as the (individual x dimension) matrix.
    n: number of individuals kept per dimension; must be at least 1.
    labels: not used by this function; kept for interface compatibility.
    labelencoder: path of a pickled object exposing inverse_transform, used
        to turn a dimension index into a name.
    outdir: directory receiving stats.txt and the PDF files.

    Raises ValueError if n is smaller than 1.
    '''
    if n < 1:
        # n == 0 selects nothing (NaN statistics) and a negative n would
        # silently select "all but the last |n|" rows.
        raise ValueError(f"n must be >= 1, got {n}")

    # Read the function's own parameters rather than the script-level ``args``
    # global, so the function also works when imported or called directly.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(labelencoder, "rb") as f:
        le = pickle.load(f)

    _, matrix_preds = core.data.read_features_with_matrix(predictions)

    custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
                     cycler(linestyle=['-', '--', '-.']))
    plot_kwargs = dict(alpha=0.5)
    x = np.linspace(0, 1, 1000)
    nb_dims = matrix_preds.shape[1]

    # The context manager closes stats.txt even if plotting fails midway.
    with open(os.path.join(outdir, "stats.txt"), "w") as stats_of:
        for j in range(nb_dims):

            label_focused = le.inverse_transform([j])[0]
            # Sorting the negated column gives descending order; keep the top n.
            indices = (-matrix_preds[:, j]).argsort()[:n]
            selected = matrix_preds[indices]

            print(f"LABEL: {label_focused}", file=stats_of)
            print(f"INDICE: {j}", file=stats_of)
            print("indices", file=stats_of)
            print(indices, file=stats_of)
            print("Best values", file=stats_of)
            print(matrix_preds[indices, j], file=stats_of)
            print("All dimensions of best values", file=stats_of)
            print(selected, file=stats_of)

            # Same colour/linestyle cycle as pred_distribution; it has to be
            # set again after every plt.clf(), which drops the axes.
            plt.gca().set_prop_cycle(custom_cycler)

            # Use the selected subset to build a plot.
            stats_mean = np.mean(selected, axis=0)
            stats_std = np.std(selected, axis=0)
            for i in range(nb_dims):
                label_str = le.inverse_transform([i])[0]
                mu = stats_mean[i]
                sigma = stats_std[i]
                variance = sigma * sigma

                print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")

                if sigma > 0:
                    P = scipy.stats.norm.cdf(x, mu, sigma)
                else:
                    # Zero spread (e.g. n == 1): norm.cdf would return NaN
                    # everywhere, so draw the step CDF of a point mass at mu.
                    P = (x >= mu).astype(float)
                plt.plot(x, P, label=label_str, **plot_kwargs)

            plt.legend()
            plt.savefig(os.path.join(outdir, f"{label_focused}_prediction_cdf.pdf"))
            plt.clf()


def utt2dur(utt2dur: str, labels: str):
    '''
    Print the shape, mean and standard deviation of utterance durations.

    utt2dur: path of a text file with one entry per line; the second
        whitespace-separated field of each line is parsed as a float duration.
        Blank lines are skipped.
    labels: not used by this function; kept for interface compatibility.

    Raises IndexError if a non-blank line has fewer than two fields and
    ValueError if the duration field is not a number.
    '''
    durations = []
    with open(utt2dur, "r") as f:
        for line in f:
            # split() without argument tolerates tabs, repeated spaces and
            # the trailing newline, unlike split(" ").
            fields = line.split()
            if not fields:
                continue
            durations.append(float(fields[1]))

    durations = np.asarray(durations, dtype=float)
    print(durations.shape)

    if durations.size:
        mean = np.mean(durations)
        std = np.std(durations)
    else:
        # Same "nan" output numpy would give, without the empty-slice warnings.
        mean = std = float("nan")

    print(f"mean: {mean}")
    print(f"std: {std}")


if __name__ == "__main__":
    # Command-line entry point: one sub-command per statistics function.
    # Each sub-parser stores the key of its function in ``which``; the
    # remaining parsed options are handed to SubCommandRunner.

    # Parser
    parser = argparse.ArgumentParser(description="Statistics")
    subparsers = parser.add_subparsers(title="actions")

    # pred-distribution
    parser_pred_dist = subparsers.add_parser(
        "pred-distribution",
        help="plot distributions of prediction through labels")
    parser_pred_dist.add_argument("--predictions", type=str, help="prediction file", required=True)
    parser_pred_dist.add_argument("--labels", type=str, help="label file", required=True)
    parser_pred_dist.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
    # The value is used as a directory (files are joined onto it).
    parser_pred_dist.add_argument("--outdir", type=str, help="output directory", required=True)
    parser_pred_dist.set_defaults(which="pred_distribution")

    # pred-distribution-with-selection
    parser_pred_dist_wt_sel = subparsers.add_parser(
        "pred-distribution-with-selection",
        help="plot distributions of prediction through labels with a selection of the n best records by column/class prediction.")
    parser_pred_dist_wt_sel.add_argument("--predictions", type=str, help="prediction file", required=True)
    parser_pred_dist_wt_sel.add_argument("-n", type=int, help="Number of maximum selected for each prediction y_i.", required=True)
    parser_pred_dist_wt_sel.add_argument("--labels", type=str, help="label file", required=True)
    parser_pred_dist_wt_sel.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
    parser_pred_dist_wt_sel.add_argument("--outdir", type=str, help="output directory", required=True)
    parser_pred_dist_wt_sel.set_defaults(which="pred_distribution_with_selection")

    # utt2dur (duration statistics)
    parser_utt2dur = subparsers.add_parser("utt2dur", help="distribution of utt2dur")
    parser_utt2dur.add_argument("--utt2dur", type=str, help="utt2dur file", required=True)
    parser_utt2dur.add_argument("--labels", type=str, default=None, help="labels file")
    parser_utt2dur.set_defaults(which="utt2dur")

    # Parse
    args = parser.parse_args()

    # When no action is given, no sub-parser default is applied and ``which``
    # is missing; report a usage error instead of failing with AttributeError.
    # (No ``dest`` is set on the sub-parsers on purpose: it would add an extra
    # key to args.__dict__, which is forwarded to the runner below.)
    if not hasattr(args, "which"):
        parser.error("an action is required (see --help)")

    # Run commands
    runner = SubCommandRunner({
        "pred_distribution": pred_distribution,
        "pred_distribution_with_selection": pred_distribution_wt_sel,
        "utt2dur": utt2dur
    })

    runner.run(args.which, args.__dict__, remove="which")