Commit 417942fcd22d566053d2dc13351e93e57d653eb6

Authored by quillotm
1 parent 9bb5ff657b
Exists in master

Specify that the "-n" argument of pred_distribution_wt_sel is required

Showing 1 changed file with 1 addition and 1 deletion Inline Diff

1 1
2 import argparse 2 import argparse
3 3
4 import os 4 import os
5 import core.data 5 import core.data
6 import math 6 import math
7 import numpy as np 7 import numpy as np
8 import scipy.stats 8 import scipy.stats
9 import pickle 9 import pickle
10 import matplotlib.pyplot as plt 10 import matplotlib.pyplot as plt
11 import matplotlib.colors as mcolors 11 import matplotlib.colors as mcolors
12 from utils import SubCommandRunner 12 from utils import SubCommandRunner
13 from cycler import cycler 13 from cycler import cycler
14 14
15 15
def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir: str):
    '''
    Distribution of the prediction.

    For each label, we plot the distribution of the class predicted.
    For example, for each character, we plot the distribution of the characters predicted.
    Another example, for each speaker, we plot the distribution of the characters predicted.

    For every label, the mean and standard deviation of each prediction
    dimension are computed over the records carrying that label, and the
    CDF of the matching normal distribution is drawn on [0, 1]. One figure
    per label is saved as "<outdir>/<label>_prediction_cdf.pdf".

    Args:
        predictions: path handed to core.data.read_id_values; the result is
            iterated as an id -> prediction vector mapping.
        labels: path handed to core.data.read_labels; the first entry of
            labels[id] is used as the label of a record.
        labelencoder: path to a pickled encoder exposing inverse_transform.
        outdir: directory in which the PDF figures are written.
    '''
    # Use the function arguments rather than the module-level `args`, so the
    # function also works when it is not called from this script's __main__.
    predictions = core.data.read_id_values(predictions, float)
    labels = core.data.read_labels(labels)

    # NOTE(review): pickle.load can execute arbitrary code; only pass
    # label-encoder files coming from a trusted source.
    with open(labelencoder, "rb") as f:
        le = pickle.load(f)

    print("PREDICTIONS ---------------------------")
    # Group the prediction vectors by label. Rows are collected in lists and
    # stacked once per label instead of re-allocating an array on every row.
    grouped = {}
    for id_, predictions_ in predictions.items():
        label = labels[id_][0]
        grouped.setdefault(label, []).append(predictions_)

    custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
                     cycler(linestyle=['-', '--', '-.']))

    print("CALCULATING ---------------------------")

    kwargs = dict(alpha=0.5)
    # The CDFs are always evaluated on the same grid.
    x = np.linspace(0, 1, 1000)

    for label, rows in grouped.items():
        label_preds = np.stack(rows, axis=0)

        plt.gca().set_prop_cycle(custom_cycler)
        stats_mean = np.mean(label_preds, axis=0)
        stats_std = np.std(label_preds, axis=0)

        for i in range(label_preds.shape[1]):
            label_str = le.inverse_transform([i])[0]
            mu = stats_mean[i]
            sigma = stats_std[i]
            variance = sigma * sigma
            print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")

            P = scipy.stats.norm.cdf(x, mu, sigma)
            plt.plot(x, P, label=label_str, **kwargs)

        plt.legend()
        plt.savefig(os.path.join(outdir, f"{label}_prediction_cdf.pdf"))
        # Clear the figure so the next label starts from an empty plot.
        plt.clf()

    print("Decisions")
96 96
97 97
def pred_distribution_wt_sel(predictions: str, n: int, labels: str, labelencoder: str, outdir: str):
    '''
    Distribution of the predictions with selection process.

    1) For each dimension, select the n records with the maximum values for the focused dimension.
       We name S_i the set of n selected records for the dimension i.
    2) For each subset S_i, we plot the distribution of each dimension.

    For every focused dimension, the selected indices and values are written
    to "<outdir>/stats.txt" and a figure with one normal CDF per dimension
    is saved as "<outdir>/<focused label>_prediction_cdf.pdf".

    Args:
        predictions: path handed to core.data.read_features_with_matrix,
            which returns (keys, matrix); only the matrix is used here.
        n: number of top-scoring rows kept for each dimension.
        labels: accepted for interface compatibility; not used.
        labelencoder: path to a pickled encoder exposing inverse_transform.
        outdir: directory receiving stats.txt and the PDF figures.
    '''
    # Use the function arguments rather than the module-level `args`, so the
    # function also works when it is not called from this script's __main__.
    # NOTE(review): pickle.load can execute arbitrary code; only pass
    # label-encoder files coming from a trusted source.
    with open(labelencoder, "rb") as f:
        le = pickle.load(f)

    _keys_preds, matrix_preds = core.data.read_features_with_matrix(predictions)

    nb_dims = matrix_preds.shape[1]
    label_names = [le.inverse_transform([i])[0] for i in range(nb_dims)]

    # NOTE(review): unlike pred_distribution, no custom property cycle is
    # applied to the axes here, so matplotlib's default line styles are used.
    kwargs = dict(alpha=0.5)
    x = np.linspace(0, 1, 1000)

    # The context manager closes the stats file even if plotting fails.
    with open(os.path.join(outdir, "stats.txt"), "w") as stats_of:
        for j in range(nb_dims):
            label_focused = label_names[j]
            # Sort descending on column j (by negating) and keep the n first rows.
            indices = (-matrix_preds[:, j]).argsort()[:n]

            print(f"LABEL: {label_focused}", file=stats_of)
            print(f"INDICE: {j}", file=stats_of)
            print("indices", file=stats_of)
            print(indices, file=stats_of)
            print("Best values", file=stats_of)
            print(matrix_preds[indices, j], file=stats_of)
            print("All dimensions of best values", file=stats_of)
            print(matrix_preds[indices], file=stats_of)

            # Use it to build a plot.
            pred_ = matrix_preds[indices]
            stats_mean = np.mean(pred_, axis=0)
            stats_std = np.std(pred_, axis=0)
            for i in range(nb_dims):
                mu = stats_mean[i]
                sigma = stats_std[i]
                variance = sigma * sigma

                print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")

                P = scipy.stats.norm.cdf(x, mu, sigma)
                plt.plot(x, P, label=label_names[i], **kwargs)

            plt.legend()
            plt.savefig(os.path.join(outdir, f"{label_focused}_prediction_cdf.pdf"))
            # Clear the figure so the next dimension starts from an empty plot.
            plt.clf()
161 161
162 162
def utt2dur(utt2dur: str, labels: str):
    '''
    Print the mean and standard deviation of the durations of a utt2dur file.

    Each non-blank line is split on whitespace and its second field is read
    as a float duration. The shape of the duration array, the mean and the
    standard deviation are printed to stdout.

    Args:
        utt2dur: path of the file to read.
        labels: accepted for interface compatibility; not used.
    '''
    durations = []
    with open(utt2dur, "r") as f:
        for line in f:
            # split() without argument tolerates tabs, repeated spaces and
            # trailing whitespace; blank lines are skipped instead of
            # raising an IndexError.
            fields = line.split()
            if not fields:
                continue
            durations.append(float(fields[1]))

    durations = np.asarray(durations, dtype=float)
    print(durations.shape)
    mean = np.mean(durations)
    std = np.std(durations)

    print(f"mean: {mean}")
    print(f"std: {std}")
182 182
183 183
if __name__ == "__main__":

    # Parser
    parser = argparse.ArgumentParser(description="Statistics")
    subparsers = parser.add_subparsers(title="actions")

    # pred-distribution
    parser_pred_dist = subparsers.add_parser("pred-distribution", help="plot distributions of prediction through labels")
    parser_pred_dist.add_argument("--predictions", type=str, help="prediction file", required=True)
    parser_pred_dist.add_argument("--labels", type=str, help="label file", required=True)
    parser_pred_dist.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
    parser_pred_dist.add_argument("--outdir", type=str, help="output file", required=True)
    parser_pred_dist.set_defaults(which="pred_distribution")

    # pred-distribution-with-selection
    parser_pred_dist_wt_sel = subparsers.add_parser("pred-distribution-with-selection", help="plot distributions of prediction through labels with a selection of the n best records by column/class prediction.")
    parser_pred_dist_wt_sel.add_argument("--predictions", type=str, help="prediction file", required=True)
    parser_pred_dist_wt_sel.add_argument("-n", type=int, help="Number of maximum selected for each prediction y_i.", required=True)
    parser_pred_dist_wt_sel.add_argument("--labels", type=str, help="label file", required=True)
    parser_pred_dist_wt_sel.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
    parser_pred_dist_wt_sel.add_argument("--outdir", type=str, help="output file", required=True)
    parser_pred_dist_wt_sel.set_defaults(which="pred_distribution_with_selection")

    # duration-stats
    parser_utt2dur = subparsers.add_parser("utt2dur", help="distribution of utt2dur")
    parser_utt2dur.add_argument("--utt2dur", type=str, help="utt2dur file", required=True)
    parser_utt2dur.add_argument("--labels", type=str, default=None, help="labels file")
    parser_utt2dur.set_defaults(which="utt2dur")

    # Parse
    args = parser.parse_args()

    # "which" is only set by the sub-parsers' defaults, so it is missing when
    # no action was given on the command line. Report a usage error instead
    # of crashing with an AttributeError below.
    if not hasattr(args, "which"):
        parser.error("an action is required: pred-distribution, pred-distribution-with-selection or utt2dur")

    # Run commands
    # NOTE(review): SubCommandRunner comes from utils; presumably it calls the
    # function registered under args.which with the parsed arguments as
    # keyword arguments, minus the "which" entry - confirm against utils.
    runner = SubCommandRunner({
        "pred_distribution": pred_distribution,
        "pred_distribution_with_selection": pred_distribution_wt_sel,
        "utt2dur": utt2dur
    })

    runner.run(args.which, args.__dict__, remove="which")