# stats.py (3.34 KB)
import argparse
import os
import core.data
import math
import numpy as np
import scipy.stats
import pickle
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from cycler import cycler
def stats():
    """Print a "Decisions" banner, then the names of matplotlib's Tableau colors.

    Takes no arguments and returns None; the only effect is two lines
    written to stdout.
    """
    # NOTE(review): the source this was recovered from had lost its
    # indentation; the second print is assumed to belong to this function
    # rather than to module level -- confirm against the original file.
    print("Decisions")
    tableau_color_names = list(mcolors.TABLEAU_COLORS)
    print(tableau_color_names)
def group_predictions_by_label(predictions, labels):
    """Group per-utterance prediction vectors by their reference label.

    Args:
        predictions: mapping from utterance id to a sequence of scores.
            Assumes every sequence has the same length (one score per
            class) -- TODO confirm against core.data.read_id_values.
        labels: mapping from utterance id to an indexable whose first
            element is the label used for grouping.

    Returns:
        dict mapping each label (in first-seen order) to
        ``{"nb_utt": <number of utterances>, "predictions": <2-D array>}``
        where the array has one row per utterance.

    Raises:
        KeyError: if an id in ``predictions`` is missing from ``labels``.
    """
    rows_by_label = {}
    for id_, values in predictions.items():
        label = labels[id_][0]
        rows_by_label.setdefault(label, []).append(values)

    # Stack once per label: growing the array with np.append inside the loop
    # re-copies every previously stored row for each new utterance.
    return {
        label: {"nb_utt": len(rows), "predictions": np.stack(rows, axis=0)}
        for label, rows in rows_by_label.items()
    }


def normal_cdf(x, mu, sigma):
    """Return the CDF of N(mu, sigma) evaluated at the points ``x``.

    A standard deviation of exactly zero (a label with one utterance, or
    constant scores) makes ``scipy.stats.norm.cdf`` return NaN, so that case
    falls back to the degenerate step function jumping from 0 to 1 at ``mu``.
    """
    if sigma == 0:
        return np.where(x >= mu, 1.0, 0.0)
    return scipy.stats.norm.cdf(x, mu, sigma)


def plot_label_cdfs(grouped, label_encoder, outdir):
    """Write one ``<label>_prediction_cdf.pdf`` per label into ``outdir``.

    For every label, each column of its prediction matrix is summarised by
    its mean and standard deviation, and the CDF of the matching normal
    distribution is drawn over [0, 1].

    Args:
        grouped: mapping as returned by ``group_predictions_by_label``.
        label_encoder: object exposing ``inverse_transform``; used to turn a
            column index into the curve's legend text (presumably a fitted
            scikit-learn LabelEncoder -- verify against the pickle producer).
        outdir: directory the PDF files are saved into.
    """
    # Color x linestyle product so that curves remain distinguishable once
    # the Tableau colors have been exhausted.
    custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
                     cycler(linestyle=['-', '--', '-.']))
    x = np.linspace(0, 1, 1000)

    for label, entry in grouped.items():
        plt.gca().set_prop_cycle(custom_cycler)
        class_means = np.mean(entry["predictions"], axis=0)
        class_stds = np.std(entry["predictions"], axis=0)

        for i, (mu, sigma) in enumerate(zip(class_means, class_stds)):
            label_str = label_encoder.inverse_transform([i])[0]
            variance = sigma * sigma
            print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")
            plt.plot(x, normal_cdf(x, mu, sigma), label=label_str, alpha=0.5)

        plt.legend()
        plt.savefig(os.path.join(outdir, f"{label}_prediction_cdf.pdf"))
        # Reset the figure so the next label starts from empty axes.
        plt.clf()


def main():
    """Parse the command line, load the inputs and plot the per-label CDFs."""
    # Parser
    parser = argparse.ArgumentParser(description="")

    # Arguments
    parser.add_argument("--predictions", type=str, help="prediction file", required=True)
    parser.add_argument("--labels", type=str, help="label file", required=True)
    parser.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
    parser.add_argument("--outdir", type=str, help="output directory", required=True)
    args = parser.parse_args()

    predictions = core.data.read_id_values(args.predictions, float)
    labels = core.data.read_labels(args.labels)

    # SECURITY NOTE: unpickling can execute arbitrary code; only pass label
    # encoder files that come from a trusted source.
    with open(args.labelencoder, "rb") as f:
        le = pickle.load(f)

    print("PREDICTIONS ---------------------------")
    # Named label_stats (not stats) so the module-level stats() function is
    # no longer overwritten by a dict when the script runs.
    label_stats = group_predictions_by_label(predictions, labels)

    print("CALCULATING ---------------------------")
    # savefig does not create missing directories.
    if args.outdir:
        os.makedirs(args.outdir, exist_ok=True)
    plot_label_cdfs(label_stats, le, args.outdir)


if __name__ == "__main__":
    main()

# TODO:
# One graph for each label. Distribution of their predictions output are displayed.