From 9bb5ff657bf803e1ce5a403f9998e700bf3a3f72 Mon Sep 17 00:00:00 2001
From: quillotm
Date: Wed, 30 Jun 2021 10:19:00 +0200
Subject: [PATCH] Adding n argument to pred_distribution_wt_sel

Adding some comments
---
 volia/stats.py | 91 +++++++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 71 insertions(+), 20 deletions(-)

diff --git a/volia/stats.py b/volia/stats.py
index c22a75e..d46cd11 100644
--- a/volia/stats.py
+++ b/volia/stats.py
@@ -10,12 +10,18 @@ import pickle
 import matplotlib.pyplot as plt
 import matplotlib.colors as mcolors
 from utils import SubCommandRunner
-
-
 from cycler import cycler
+
 
 
 def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir: str):
+    '''
+    Distribution of the predictions.
+    For each label, we plot the distribution of the predicted classes.
+    For example, for each character, we plot the distribution of the predicted characters.
+    Another example: for each speaker, we plot the distribution of the predicted characters.
+
+    '''
 
     predictions = core.data.read_id_values(args.predictions, float)
     labels = core.data.read_labels(args.labels)
@@ -35,10 +41,6 @@ def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir:
         else:
             stats[label]["nb_utt"] = stats[label]["nb_utt"] + 1
             stats[label]["predictions"] = np.append(stats[label]["predictions"], np.expand_dims(predictions_, axis=0), axis=0)
-
-
-    print("CALCULATING ---------------------------")
-
 
     colors = [
         "darkorange",
@@ -48,13 +50,14 @@ def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir:
     custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
                      cycler(linestyle=['-', '--', '-.']))
 
+    print("CALCULATING ---------------------------")
     for label, stats_ in stats.items():
 
         plt.gca().set_prop_cycle(custom_cycler)
 
         stats_mean = np.mean(stats_["predictions"], axis=0)
         stats_std = np.std(stats_["predictions"], axis=0)
-
+
         #print(label)
         #print(stats_mean)
         #print(stats_std)
@@ -66,7 +69,6 @@ def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir:
             mu = stats_mean[i]
             variance = stats_std[i] * stats_std[i]
             sigma = stats_std[i]
-            # math.sqrt(variance)
 
             print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")
             #x_values = np.arange(-1, 5, 0.1)
@@ -75,7 +77,7 @@ def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir:
             #y_values = scipy.stats.norm(stats_mean[i], stats_std[i])
             #y = scipy.stats.norm.pdf(x,mean,std)
             #plt.plot(x_values, y_values.pdf(x_values,))
-
+
             #x, step = np.linspace(mu - 3*sigma, mu + 3*sigma, 1000, retstep=True)
             x = np.linspace(0, 1, 1000)
             #x = np.linspace(mu - 3*sigma, mu + 3*sigma, 1000)
@@ -93,21 +95,68 @@ def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir:
     print("Decisions")
 
 
-def pred_distribution_wt_sel(predictions: str, labels: str, labelencoder: str, outdir: str):
+def pred_distribution_wt_sel(predictions: str, n: int, labels: str, labelencoder: str, outdir: str):
+
+    '''
+    Distribution of the predictions, with a selection step.
+
+    1) For each dimension, select the n individuals with the highest values on that dimension.
+       We call S_i the set of the n individuals selected for dimension i.
+    2) For each subset S_i, we plot the distribution of every dimension.
+    '''
+
+    le = None
+    with open(args.labelencoder, "rb") as f:
+        le = pickle.load(f)
     keys_preds, matrix_preds = core.data.read_features_with_matrix(predictions)
 
-    n = 3
-    print(matrix_preds.shape)
+
+    colors = [
+        "darkorange",
+        "red",
+        "blue"
+    ]
+    custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
+                     cycler(linestyle=['-', '--', '-.']))
+
+    kwargs = dict(alpha=0.5)
+
+    stats_of = open(os.path.join(args.outdir, f"stats.txt"), "w")
     for j in range(matrix_preds.shape[1]):
+
+        label_focused = le.inverse_transform([j])[0]
         indices = (-matrix_preds[:, j]).argsort()[:n]
-        print(f"INDICE: {j}")
-        print("indices")
-        print(indices)
-        print("Best values")
-        print(matrix_preds[indices, j])
-        print("All dimensions of best values")
-        print(matrix_preds[indices])
-        # Select the n best for each column
+
+        print(f"LABEL: {label_focused}", file=stats_of)
+        print(f"INDICE: {j}", file=stats_of)
+        print("indices", file=stats_of)
+        print(indices, file=stats_of)
+        print("Best values", file=stats_of)
+        print(matrix_preds[indices, j], file=stats_of)
+        print("All dimensions of best values", file=stats_of)
+        print(matrix_preds[indices], file=stats_of)
+
+        # Use the selected rows to build a plot.
+        pred_ = matrix_preds[indices]
+        stats_mean = np.mean(pred_, axis=0)
+        stats_std = np.std(pred_, axis=0)
+        for i in range(matrix_preds.shape[1]):
+            label_str = le.inverse_transform([i])[0]
+            mu = stats_mean[i]
+            variance = stats_std[i] * stats_std[i]
+            sigma = stats_std[i]
+
+            print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")
+
+            x = np.linspace(0, 1, 1000)
+
+            P = scipy.stats.norm.cdf(x, mu, sigma)
+            plt.plot(x, P, label=label_str, **kwargs)
+
+        plt.legend()
+        plt.savefig(os.path.join(args.outdir, f"{label_focused}_prediction_cdf.pdf"))
+        plt.clf()
+    stats_of.close()
     pass
 
 
@@ -149,10 +198,12 @@ if __name__ == "__main__":
     # pred-distribution-with-selection
     parser_pred_dist_wt_sel = subparsers.add_parser("pred-distribution-with-selection", help="plot distributions of prediction through labels with a selection of the n best records by column/class prediction.")
     parser_pred_dist_wt_sel.add_argument("--predictions", type=str, help="prediction file", required=True)
+    parser_pred_dist_wt_sel.add_argument("-n", type=int, help="number of top-scoring records selected for each prediction dimension y_i.")
     parser_pred_dist_wt_sel.add_argument("--labels", type=str, help="label file", required=True)
     parser_pred_dist_wt_sel.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
     parser_pred_dist_wt_sel.add_argument("--outdir", type=str, help="output file", required=True)
     parser_pred_dist_wt_sel.set_defaults(which="pred_distribution_with_selection")
+
     # duration-stats
     parser_utt2dur = subparsers.add_parser("utt2dur", help="distribution of utt2dur")
     parser_utt2dur.add_argument("--utt2dur", type=str, help="utt2dur file", required=True)
-- 
1.8.2.3
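
Note on the selection logic added above: for each prediction dimension j, the patched pred_distribution_wt_sel keeps the n rows with the highest scores on that dimension, then plots the Gaussian CDF of every dimension estimated on that subset. The sketch below is a minimal, self-contained approximation of that flow, not the patched function itself: the prediction matrix and class names are synthetic stand-ins for what core.data.read_features_with_matrix and the pickled label encoder provide, and only numpy/scipy/matplotlib calls already used in the patch are relied on.

# Standalone sketch of the top-n selection plus per-dimension CDF plot.
# The class names and the prediction matrix are made up for illustration;
# the real script reads them from files via core.data and a label encoder.
import numpy as np
import scipy.stats
import matplotlib
matplotlib.use("Agg")  # render to files only, no display required
import matplotlib.pyplot as plt

n = 3                                   # number of rows kept per dimension
class_names = ["liu", "bei", "guan"]    # hypothetical label names
rng = np.random.default_rng(0)
matrix_preds = rng.random((20, len(class_names)))   # fake prediction matrix

x = np.linspace(0, 1, 1000)
for j, focused in enumerate(class_names):
    # argsort on the negated column gives the indices of the n largest scores
    indices = (-matrix_preds[:, j]).argsort()[:n]
    selection = matrix_preds[indices]

    # per-dimension statistics of the selected subset S_j
    mu = selection.mean(axis=0)
    sigma = selection.std(axis=0)

    for i, label in enumerate(class_names):
        # a degenerate sigma would collapse the CDF to a step; guarded here for the sketch
        cdf = scipy.stats.norm.cdf(x, mu[i], max(sigma[i], 1e-6))
        plt.plot(x, cdf, label=label, alpha=0.5)

    plt.legend()
    plt.savefig(f"{focused}_prediction_cdf.pdf")
    plt.clf()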
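On the command-line side, the new -n flag is wired as an optional argument: it has type=int but neither required=True nor a default, so args.n is None when the flag is omitted. Assuming the script is run directly with Python (the dispatch through SubCommandRunner is outside this hunk), an invocation would look like: python volia/stats.py pred-distribution-with-selection --predictions preds.txt -n 3 --labels labels.txt --labelencoder le.pkl --outdir out/ (the file names here are placeholders).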