Commit 417942fcd22d566053d2dc13351e93e57d653eb6
1 parent
9bb5ff657b
Exists in
master
Specify that the "n" argument of pred_distribution_wt_sel is required
Showing 1 changed file with 1 addition and 1 deletion Inline Diff
volia/stats.py
1 | 1 | ||
2 | import argparse | 2 | import argparse |
3 | 3 | ||
4 | import os | 4 | import os |
5 | import core.data | 5 | import core.data |
6 | import math | 6 | import math |
7 | import numpy as np | 7 | import numpy as np |
8 | import scipy.stats | 8 | import scipy.stats |
9 | import pickle | 9 | import pickle |
10 | import matplotlib.pyplot as plt | 10 | import matplotlib.pyplot as plt |
11 | import matplotlib.colors as mcolors | 11 | import matplotlib.colors as mcolors |
12 | from utils import SubCommandRunner | 12 | from utils import SubCommandRunner |
13 | from cycler import cycler | 13 | from cycler import cycler |
14 | 14 | ||
15 | 15 | ||
def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir: str) -> None:
    '''
    Plot, for each label, the distribution of the predicted classes.

    For each label, we plot the distribution of the class predicted.
    For example, for each character, we plot the distribution of the characters predicted.
    Another example, for each speaker, we plot the distribution of the characters predicted.

    For every label, the mean and standard deviation of each prediction
    dimension are computed, and the CDF of the matching normal distribution
    is drawn over [0, 1]. One figure per label is written to
    ``<outdir>/<label>_prediction_cdf.pdf``.

    predictions: path of the prediction file, read with core.data.read_id_values.
    labels: path of the label file, read with core.data.read_labels.
    labelencoder: path of a pickled label encoder exposing inverse_transform.
    outdir: directory receiving the generated PDF files.
    '''
    # Use the function arguments rather than a module-level ``args`` object
    # so that the function also works when imported from another module.
    pred_by_id = core.data.read_id_values(predictions, float)
    labels_by_id = core.data.read_labels(labels)

    # NOTE(review): pickle.load can execute arbitrary code; only load label
    # encoder files coming from a trusted source.
    with open(labelencoder, "rb") as f:
        le = pickle.load(f)

    print("PREDICTIONS ---------------------------")
    # Group the prediction vectors by the first label of each id. Rows are
    # collected in lists and stacked once, instead of re-allocating the
    # matrix with np.append for every utterance.
    rows_by_label = {}
    for id_, predictions_ in pred_by_id.items():
        label = labels_by_id[id_][0]
        rows_by_label.setdefault(label, []).append(predictions_)

    custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
                     cycler(linestyle=['-', '--', '-.']))
    kwargs = dict(alpha=0.5)
    # The CDFs are evaluated on a fixed grid over [0, 1].
    x = np.linspace(0, 1, 1000)

    print("CALCULATING ---------------------------")

    for label, rows in rows_by_label.items():
        label_preds = np.stack(rows, axis=0)

        # plt.clf() below resets the axes, so the cycler is set per figure.
        plt.gca().set_prop_cycle(custom_cycler)
        stats_mean = np.mean(label_preds, axis=0)
        stats_std = np.std(label_preds, axis=0)

        for i in range(label_preds.shape[1]):
            label_str = le.inverse_transform([i])[0]
            mu = stats_mean[i]
            sigma = stats_std[i]
            variance = sigma * sigma
            print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")

            P = scipy.stats.norm.cdf(x, mu, sigma)
            plt.plot(x, P, label=label_str, **kwargs)

        plt.legend()
        plt.savefig(os.path.join(outdir, f"{label}_prediction_cdf.pdf"))
        plt.clf()

    print("Decisions")
96 | 96 | ||
97 | 97 | ||
def pred_distribution_wt_sel(predictions: str, n: int, labels: str, labelencoder: str, outdir: str) -> None:
    '''
    Distribution of the predictions with selection process.

    1) For each dimension, select the n individuals with the maximum values for the focused dimension.
       We name S_i the set of n selected individuals for the dimension i.
    2) For each subset S_i, we plot the distribution of each dimension.

    The selected indices and values are written to ``<outdir>/stats.txt``
    and one figure per focused dimension is written to
    ``<outdir>/<label>_prediction_cdf.pdf``.

    predictions: path of the prediction file, read with
        core.data.read_features_with_matrix.
    n: number of highest-scoring rows kept for each dimension.
    labels: label file path; not used by this function.
    labelencoder: path of a pickled label encoder exposing inverse_transform.
    outdir: directory receiving stats.txt and the generated PDF files.
    '''
    # Use the function arguments rather than a module-level ``args`` object
    # so that the function also works when imported from another module.
    # NOTE(review): pickle.load can execute arbitrary code; only load label
    # encoder files coming from a trusted source.
    with open(labelencoder, "rb") as f:
        le = pickle.load(f)

    _, matrix_preds = core.data.read_features_with_matrix(predictions)
    nb_dims = matrix_preds.shape[1]

    custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
                     cycler(linestyle=['-', '--', '-.']))
    kwargs = dict(alpha=0.5)
    # The CDFs are evaluated on a fixed grid over [0, 1].
    x = np.linspace(0, 1, 1000)

    # The context manager closes stats.txt even if plotting raises.
    with open(os.path.join(outdir, "stats.txt"), "w") as stats_of:
        for j in range(nb_dims):

            label_focused = le.inverse_transform([j])[0]
            # Sorting the negated column yields indices in descending order
            # of the focused dimension; keep the n first ones.
            indices = (-matrix_preds[:, j]).argsort()[:n]

            print(f"LABEL: {label_focused}", file=stats_of)
            print(f"INDICE: {j}", file=stats_of)
            print("indices", file=stats_of)
            print(indices, file=stats_of)
            print("Best values", file=stats_of)
            print(matrix_preds[indices, j], file=stats_of)
            print("All dimensions of best values", file=stats_of)
            print(matrix_preds[indices], file=stats_of)

            # Use it to build a plot.
            # plt.clf() below resets the axes, so the cycler is set per figure.
            plt.gca().set_prop_cycle(custom_cycler)
            pred_ = matrix_preds[indices]
            stats_mean = np.mean(pred_, axis=0)
            stats_std = np.std(pred_, axis=0)
            for i in range(nb_dims):
                label_str = le.inverse_transform([i])[0]
                mu = stats_mean[i]
                sigma = stats_std[i]
                variance = sigma * sigma

                print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")

                P = scipy.stats.norm.cdf(x, mu, sigma)
                plt.plot(x, P, label=label_str, **kwargs)

            plt.legend()
            plt.savefig(os.path.join(outdir, f"{label_focused}_prediction_cdf.pdf"))
            plt.clf()
161 | 161 | ||
162 | 162 | ||
def utt2dur(utt2dur: str, labels: str) -> None:
    '''
    Print the shape, mean and standard deviation of utterance durations.

    utt2dur: path of a text file whose lines hold whitespace-separated
        fields, the second field being the duration parsed as a float.
    labels: labels file path (may be None); currently not used.
    '''
    durations = []
    with open(utt2dur, "r") as f:
        for line in f:
            # split() without argument tolerates tabs, repeated spaces and
            # the trailing newline; blank lines are skipped.
            fields = line.split()
            if not fields:
                continue
            durations.append(float(fields[1]))

    durations = np.asarray(durations, dtype=float)
    print(durations.shape)
    mean = np.mean(durations)
    std = np.std(durations)

    print(f"mean: {mean}")
    print(f"std: {std}")
182 | 182 | ||
183 | 183 | ||
if __name__ == "__main__":

    # Parser: one sub-command per statistics action. Each sub-parser stores
    # the key of the function to run in the "which" attribute.
    parser = argparse.ArgumentParser(description="Statistics")
    subparsers = parser.add_subparsers(title="actions")

    # pred-distribution
    parser_pred_dist = subparsers.add_parser("pred-distribution", help="plot distributions of prediction through labels")
    parser_pred_dist.add_argument("--predictions", type=str, help="prediction file", required=True)
    parser_pred_dist.add_argument("--labels", type=str, help="label file", required=True)
    parser_pred_dist.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
    parser_pred_dist.add_argument("--outdir", type=str, help="output directory", required=True)
    parser_pred_dist.set_defaults(which="pred_distribution")

    # pred-distribution-with-selection
    parser_pred_dist_wt_sel = subparsers.add_parser("pred-distribution-with-selection", help="plot distributions of prediction through labels with a selection of the n best records by column/class prediction.")
    parser_pred_dist_wt_sel.add_argument("--predictions", type=str, help="prediction file", required=True)
    parser_pred_dist_wt_sel.add_argument("-n", type=int, help="Number of maximum selected for each prediction y_i.", required=True)
    parser_pred_dist_wt_sel.add_argument("--labels", type=str, help="label file", required=True)
    parser_pred_dist_wt_sel.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
    parser_pred_dist_wt_sel.add_argument("--outdir", type=str, help="output directory", required=True)
    parser_pred_dist_wt_sel.set_defaults(which="pred_distribution_with_selection")

    # duration-stats
    parser_utt2dur = subparsers.add_parser("utt2dur", help="distribution of utt2dur")
    parser_utt2dur.add_argument("--utt2dur", type=str, help="utt2dur file", required=True)
    parser_utt2dur.add_argument("--labels", type=str, default=None, help="labels file")
    parser_utt2dur.set_defaults(which="utt2dur")

    # Parse
    args = parser.parse_args()

    # Without a sub-command no "which" default is set; report a usage error
    # instead of failing later with an AttributeError.
    if not hasattr(args, "which"):
        parser.error("an action is required (see --help)")

    # Run commands: the parsed arguments (minus "which") are handed to the
    # runner together with the key of the selected function.
    runner = SubCommandRunner({
        "pred_distribution": pred_distribution,
        "pred_distribution_with_selection": pred_distribution_wt_sel,
        "utt2dur": utt2dur
    })

    runner.run(args.which, vars(args), remove="which")