Commit 9bb5ff657bf803e1ce5a403f9998e700bf3a3f72

Authored by quillotm
1 parent 78b39d22dd
Exists in master

Adding n argument to pred_distribution_wt_sel

Adding some comments

Showing 1 changed file with 71 additions and 20 deletions Inline Diff

1 1
2 import argparse 2 import argparse
3 3
4 import os 4 import os
5 import core.data 5 import core.data
6 import math 6 import math
7 import numpy as np 7 import numpy as np
8 import scipy.stats 8 import scipy.stats
9 import pickle 9 import pickle
10 import matplotlib.pyplot as plt 10 import matplotlib.pyplot as plt
11 import matplotlib.colors as mcolors 11 import matplotlib.colors as mcolors
12 from utils import SubCommandRunner 12 from utils import SubCommandRunner
13
14
15 from cycler import cycler 13 from cycler import cycler
16 14
15
17 def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir: str): 16 def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir: str):
17 '''
18 Distribution of the prediction.
18 19
20 For each label, we plot the distribution of the class predicted.
21 For example, for each character, we plot the distribution of the characters predicted.
22 Another example, for each speaker, we plot the distribution of the characters predicted.
23
24 '''
19 predictions = core.data.read_id_values(args.predictions, float) 25 predictions = core.data.read_id_values(args.predictions, float)
20 labels = core.data.read_labels(args.labels) 26 labels = core.data.read_labels(args.labels)
21 27
22 le = None 28 le = None
23 with open(args.labelencoder, "rb") as f: 29 with open(args.labelencoder, "rb") as f:
24 le = pickle.load(f) 30 le = pickle.load(f)
25 stats = {} 31 stats = {}
26 32
27 print("PREDICTIONS ---------------------------") 33 print("PREDICTIONS ---------------------------")
28 for id_, predictions_ in predictions.items(): 34 for id_, predictions_ in predictions.items():
29 label = labels[id_][0] 35 label = labels[id_][0]
30 if label not in stats: 36 if label not in stats:
31 stats[label] = { 37 stats[label] = {
32 "nb_utt": 1, 38 "nb_utt": 1,
33 "predictions": np.expand_dims(predictions_, axis=0) 39 "predictions": np.expand_dims(predictions_, axis=0)
34 } 40 }
35 else: 41 else:
36 stats[label]["nb_utt"] = stats[label]["nb_utt"] + 1 42 stats[label]["nb_utt"] = stats[label]["nb_utt"] + 1
37 stats[label]["predictions"] = np.append(stats[label]["predictions"], np.expand_dims(predictions_, axis=0), axis=0) 43 stats[label]["predictions"] = np.append(stats[label]["predictions"], np.expand_dims(predictions_, axis=0), axis=0)
38
39 44
40 print("CALCULATING ---------------------------")
41
42
43 colors = [ 45 colors = [
44 "darkorange", 46 "darkorange",
45 "red", 47 "red",
46 "blue" 48 "blue"
47 ] 49 ]
48 custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) * 50 custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
49 cycler(linestyle=['-', '--', '-.'])) 51 cycler(linestyle=['-', '--', '-.']))
50 52
53 print("CALCULATING ---------------------------")
51 54
52 for label, stats_ in stats.items(): 55 for label, stats_ in stats.items():
53 56
54 plt.gca().set_prop_cycle(custom_cycler) 57 plt.gca().set_prop_cycle(custom_cycler)
55 stats_mean = np.mean(stats_["predictions"], axis=0) 58 stats_mean = np.mean(stats_["predictions"], axis=0)
56 stats_std = np.std(stats_["predictions"], axis=0) 59 stats_std = np.std(stats_["predictions"], axis=0)
57 60
58 #print(label) 61 #print(label)
59 #print(stats_mean) 62 #print(stats_mean)
60 #print(stats_std) 63 #print(stats_std)
61 kwargs = dict(alpha=0.5) 64 kwargs = dict(alpha=0.5)
62 65
63 for i in range(stats_["predictions"].shape[1]): 66 for i in range(stats_["predictions"].shape[1]):
64 label_str = le.inverse_transform([i])[0] 67 label_str = le.inverse_transform([i])[0]
65 #plt.hist(stats_["predictions"][:, i], bins=10, label=label_str, **kwargs) 68 #plt.hist(stats_["predictions"][:, i], bins=10, label=label_str, **kwargs)
66 mu = stats_mean[i] 69 mu = stats_mean[i]
67 variance = stats_std[i] * stats_std[i] 70 variance = stats_std[i] * stats_std[i]
68 sigma = stats_std[i] 71 sigma = stats_std[i]
69 # math.sqrt(variance)
70 print(f"{i}: mu {mu}, var {variance}, sigma {sigma}") 72 print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")
71 73
72 #x_values = np.arange(-1, 5, 0.1) 74 #x_values = np.arange(-1, 5, 0.1)
73 75
74 #y_values = scipy.stats.norm(mu, variance) 76 #y_values = scipy.stats.norm(mu, variance)
75 #y = scipy.stats.norm.pdf(x,mean,std) 77 #y = scipy.stats.norm.pdf(x,mean,std)
76 78
77 #plt.plot(x_values, y_values.pdf(x_values,)) 79 #plt.plot(x_values, y_values.pdf(x_values,))
78 80
79 #x, step = np.linspace(mu - 3*sigma, mu + 3*sigma, 1000, retstep=True) 81 #x, step = np.linspace(mu - 3*sigma, mu + 3*sigma, 1000, retstep=True)
80 x = np.linspace(0, 1, 1000) 82 x = np.linspace(0, 1, 1000)
81 #x = np.linspace(mu - 3*sigma, mu + 3*sigma, 1000) 83 #x = np.linspace(mu - 3*sigma, mu + 3*sigma, 1000)
82 #x, step = np.linspace(0, 1, 1000, retstep=True) 84 #x, step = np.linspace(0, 1, 1000, retstep=True)
83 85
84 P = scipy.stats.norm.cdf(x, mu, sigma) 86 P = scipy.stats.norm.cdf(x, mu, sigma)
85 #print(step) 87 #print(step)
86 plt.plot(x, P, label=label_str, **kwargs) 88 plt.plot(x, P, label=label_str, **kwargs)
87 #plt.savefig("simple_gaussian.pdf") 89 #plt.savefig("simple_gaussian.pdf")
88 90
89 plt.legend() 91 plt.legend()
90 plt.savefig(os.path.join(args.outdir, f"{label}_prediction_cdf.pdf")) 92 plt.savefig(os.path.join(args.outdir, f"{label}_prediction_cdf.pdf"))
91 plt.clf() 93 plt.clf()
92 94
93 print("Decisions") 95 print("Decisions")
94 96
95 97
96 def pred_distribution_wt_sel(predictions: str, labels: str, labelencoder: str, outdir: str): 98 def pred_distribution_wt_sel(predictions: str, n: int, labels: str, labelencoder: str, outdir: str):
97 99
100 '''
101 Distribution of the predictions with selection process.
102
103 1) For each dimension, select the n individuals with the maximum values for the focused dimension.
104 We name S_i the set of n selected individuals for the dimension i.
105 2) For each subset S_i, we plot the distribution of each dimension.
106 '''
107
108 le = None
109 with open(args.labelencoder, "rb") as f:
110 le = pickle.load(f)
111
98 keys_preds, matrix_preds = core.data.read_features_with_matrix(predictions) 112 keys_preds, matrix_preds = core.data.read_features_with_matrix(predictions)
99 n = 3 113
100 print(matrix_preds.shape) 114 colors = [
115 "darkorange",
116 "red",
117 "blue"
118 ]
119 custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
120 cycler(linestyle=['-', '--', '-.']))
121
122 kwargs = dict(alpha=0.5)
123
124 stats_of = open(os.path.join(args.outdir, f"stats.txt"), "w")
101 for j in range(matrix_preds.shape[1]): 125 for j in range(matrix_preds.shape[1]):
126
127 label_focused = le.inverse_transform([j])[0]
102 indices = (-matrix_preds[:, j]).argsort()[:n] 128 indices = (-matrix_preds[:, j]).argsort()[:n]
103 print(f"INDICE: {j}") 129
104 print("indices") 130 print(f"LABEL: {label_focused}", file=stats_of)
105 print(indices) 131 print(f"INDICE: {j}", file=stats_of)
106 print("Best values") 132 print("indices", file=stats_of)
107 print(matrix_preds[indices, j]) 133 print(indices, file=stats_of)
108 print("All dimensions of best values") 134 print("Best values", file=stats_of)
109 print(matrix_preds[indices]) 135 print(matrix_preds[indices, j], file=stats_of)
110 # Select the n best for each column 136 print("All dimensions of best values", file=stats_of)
137 print(matrix_preds[indices], file=stats_of)
138
139 # Use it to build a plot.
140 pred_ = matrix_preds[indices]
141 stats_mean = np.mean(pred_, axis=0)
142 stats_std = np.std(pred_, axis=0)
143 for i in range(matrix_preds.shape[1]):
144 label_str = le.inverse_transform([i])[0]
145 mu = stats_mean[i]
146 variance = stats_std[i] * stats_std[i]
147 sigma = stats_std[i]
148
149 print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")
150
151 x = np.linspace(0, 1, 1000)
152
153 P = scipy.stats.norm.cdf(x, mu, sigma)
154 plt.plot(x, P, label=label_str, **kwargs)
155
156 plt.legend()
157 plt.savefig(os.path.join(args.outdir, f"{label_focused}_prediction_cdf.pdf"))
158 plt.clf()
159 stats_of.close()
111 pass 160 pass
112 161
113 162
114 def utt2dur(utt2dur: str, labels: str): 163 def utt2dur(utt2dur: str, labels: str):
115 if labels == None: 164 if labels == None:
116 pass 165 pass
117 else: 166 else:
118 pass 167 pass
119 168
120 durations = [] 169 durations = []
121 with open(utt2dur, "r") as f: 170 with open(utt2dur, "r") as f:
122 for line in f: 171 for line in f:
123 splited = line.replace("\n", "").split(" ") 172 splited = line.replace("\n", "").split(" ")
124 durations.append(float(splited[1])) 173 durations.append(float(splited[1]))
125 174
126 durations = np.asarray(durations, dtype=float) 175 durations = np.asarray(durations, dtype=float)
127 print(durations.shape) 176 print(durations.shape)
128 mean = np.mean(durations) 177 mean = np.mean(durations)
129 std = np.std(durations) 178 std = np.std(durations)
130 179
131 print(f"mean: {mean}") 180 print(f"mean: {mean}")
132 print(f"std: {std}") 181 print(f"std: {std}")
133 182
134 183
135 if __name__ == "__main__": 184 if __name__ == "__main__":
136 185
137 # Parser 186 # Parser
138 parser = argparse.ArgumentParser(description="Statistics") 187 parser = argparse.ArgumentParser(description="Statistics")
139 subparsers = parser.add_subparsers(title="actions") 188 subparsers = parser.add_subparsers(title="actions")
140 189
141 # pred-distribution 190 # pred-distribution
142 parser_pred_dist = subparsers.add_parser("pred-distribution", help="plot distributions of prediction through labels") 191 parser_pred_dist = subparsers.add_parser("pred-distribution", help="plot distributions of prediction through labels")
143 parser_pred_dist.add_argument("--predictions", type=str, help="prediction file", required=True) 192 parser_pred_dist.add_argument("--predictions", type=str, help="prediction file", required=True)
144 parser_pred_dist.add_argument("--labels", type=str, help="label file", required=True) 193 parser_pred_dist.add_argument("--labels", type=str, help="label file", required=True)
145 parser_pred_dist.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True) 194 parser_pred_dist.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
146 parser_pred_dist.add_argument("--outdir", type=str, help="output file", required=True) 195 parser_pred_dist.add_argument("--outdir", type=str, help="output file", required=True)
147 parser_pred_dist.set_defaults(which="pred_distribution") 196 parser_pred_dist.set_defaults(which="pred_distribution")
148 197
149 # pred-distribution-with-selection 198 # pred-distribution-with-selection
150 parser_pred_dist_wt_sel = subparsers.add_parser("pred-distribution-with-selection", help="plot distributions of prediction through labels with a selection of the n best records by column/class prediction.") 199 parser_pred_dist_wt_sel = subparsers.add_parser("pred-distribution-with-selection", help="plot distributions of prediction through labels with a selection of the n best records by column/class prediction.")
151 parser_pred_dist_wt_sel.add_argument("--predictions", type=str, help="prediction file", required=True) 200 parser_pred_dist_wt_sel.add_argument("--predictions", type=str, help="prediction file", required=True)
201 parser_pred_dist_wt_sel.add_argument("-n", type=int, help="Number of maximum selected for each prediction y_i.")
152 parser_pred_dist_wt_sel.add_argument("--labels", type=str, help="label file", required=True) 202 parser_pred_dist_wt_sel.add_argument("--labels", type=str, help="label file", required=True)
153 parser_pred_dist_wt_sel.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True) 203 parser_pred_dist_wt_sel.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
154 parser_pred_dist_wt_sel.add_argument("--outdir", type=str, help="output file", required=True) 204 parser_pred_dist_wt_sel.add_argument("--outdir", type=str, help="output file", required=True)
155 parser_pred_dist_wt_sel.set_defaults(which="pred_distribution_with_selection") 205 parser_pred_dist_wt_sel.set_defaults(which="pred_distribution_with_selection")
206
156 # duration-stats 207 # duration-stats
157 parser_utt2dur = subparsers.add_parser("utt2dur", help="distribution of utt2dur") 208 parser_utt2dur = subparsers.add_parser("utt2dur", help="distribution of utt2dur")
158 parser_utt2dur.add_argument("--utt2dur", type=str, help="utt2dur file", required=True) 209 parser_utt2dur.add_argument("--utt2dur", type=str, help="utt2dur file", required=True)
159 parser_utt2dur.add_argument("--labels", type=str, default=None, help="labels file") 210 parser_utt2dur.add_argument("--labels", type=str, default=None, help="labels file")
160 parser_utt2dur.set_defaults(which="utt2dur") 211 parser_utt2dur.set_defaults(which="utt2dur")
161 212
162 # Parse 213 # Parse
163 args = parser.parse_args() 214 args = parser.parse_args()
164 215
165 # Run commands 216 # Run commands