Commit 9bb5ff657bf803e1ce5a403f9998e700bf3a3f72
1 parent
78b39d22dd
Exists in
master
Adding n argument to pred_distribution_wt_sel
Adding some comments
Showing 1 changed file with 71 additions and 20 deletions Inline Diff
volia/stats.py
1 | 1 | ||
2 | import argparse | 2 | import argparse |
3 | 3 | ||
4 | import os | 4 | import os |
5 | import core.data | 5 | import core.data |
6 | import math | 6 | import math |
7 | import numpy as np | 7 | import numpy as np |
8 | import scipy.stats | 8 | import scipy.stats |
9 | import pickle | 9 | import pickle |
10 | import matplotlib.pyplot as plt | 10 | import matplotlib.pyplot as plt |
11 | import matplotlib.colors as mcolors | 11 | import matplotlib.colors as mcolors |
12 | from utils import SubCommandRunner | 12 | from utils import SubCommandRunner |
13 | |||
14 | |||
15 | from cycler import cycler | 13 | from cycler import cycler |
16 | 14 | ||
15 | |||
def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir: str):
    '''
    Plot, for each label, the distribution of the predicted classes.

    For each label, we plot the distribution of the class predicted.
    For example, for each character, we plot the distribution of the
    characters predicted. Another example, for each speaker, we plot the
    distribution of the characters predicted.

    Every prediction vector is grouped by the first label of its id. For each
    group, the mean and standard deviation of every prediction dimension are
    computed, and the CDF of the normal distribution with these parameters is
    drawn on [0, 1]. One figure is written per label in
    ``<outdir>/<label>_prediction_cdf.pdf``.

    :param predictions: path of the prediction file, read with
        ``core.data.read_id_values(..., float)``.
    :param labels: path of the label file, read with ``core.data.read_labels``.
    :param labelencoder: path of a pickled label encoder; its
        ``inverse_transform`` maps a dimension index to a class name.
    :param outdir: directory where the figures are saved.
    '''
    # The paths come from the function parameters (not from a module-level
    # ``args``), so the function also works when called from other code.
    predictions_by_id = core.data.read_id_values(predictions, float)
    labels_by_id = core.data.read_labels(labels)

    # NOTE: pickle can execute arbitrary code while loading; only give a
    # label encoder file coming from a trusted source.
    with open(labelencoder, "rb") as f:
        le = pickle.load(f)

    print("PREDICTIONS ---------------------------")
    # Collect the rows in lists first and stack once per label: growing a
    # matrix with np.append inside the loop copies it at every utterance.
    rows_by_label = {}
    for id_, predictions_ in predictions_by_id.items():
        label = labels_by_id[id_][0]
        rows_by_label.setdefault(label, []).append(predictions_)

    custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
                     cycler(linestyle=['-', '--', '-.']))
    kwargs = dict(alpha=0.5)
    # The CDF is always evaluated on the same grid.
    x = np.linspace(0, 1, 1000)

    print("CALCULATING ---------------------------")

    for label, rows in rows_by_label.items():
        label_predictions = np.stack(rows, axis=0)

        plt.gca().set_prop_cycle(custom_cycler)
        stats_mean = np.mean(label_predictions, axis=0)
        stats_std = np.std(label_predictions, axis=0)

        for i in range(label_predictions.shape[1]):
            label_str = le.inverse_transform([i])[0]
            mu = stats_mean[i]
            sigma = stats_std[i]
            variance = sigma * sigma
            print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")

            P = scipy.stats.norm.cdf(x, mu, sigma)
            plt.plot(x, P, label=label_str, **kwargs)

        plt.legend()
        plt.savefig(os.path.join(outdir, f"{label}_prediction_cdf.pdf"))
        # Start from an empty figure for the next label.
        plt.clf()

    print("Decisions")
94 | 96 | ||
95 | 97 | ||
def pred_distribution_wt_sel(predictions: str, n: int, labels: str, labelencoder: str, outdir: str):
    '''
    Distribution of the predictions with selection process.

    1) For each dimension, select the n individuals with the maximum values
       for the focused dimension. We name S_i the set of n selected
       individuals for the dimension i.
    2) For each subset S_i, we plot the distribution of each dimension.

    The selected indices and values are written in ``<outdir>/stats.txt`` and
    one figure per focused dimension is saved in
    ``<outdir>/<label>_prediction_cdf.pdf``.

    :param predictions: path of the prediction file, read with
        ``core.data.read_features_with_matrix``.
    :param n: number of rows kept for each dimension. With ``None`` the slice
        keeps every row.
    :param labels: not used by this function.
    :param labelencoder: path of a pickled label encoder; its
        ``inverse_transform`` maps a dimension index to a class name.
    :param outdir: directory where ``stats.txt`` and the figures are written.
    '''
    # NOTE: pickle can execute arbitrary code while loading; only give a
    # label encoder file coming from a trusted source.
    with open(labelencoder, "rb") as f:
        le = pickle.load(f)

    _, matrix_preds = core.data.read_features_with_matrix(predictions)

    custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
                     cycler(linestyle=['-', '--', '-.']))
    kwargs = dict(alpha=0.5)
    # The CDF is always evaluated on the same grid.
    x = np.linspace(0, 1, 1000)
    nb_dims = matrix_preds.shape[1]

    # The context manager closes stats.txt even if a plot or a lookup fails.
    with open(os.path.join(outdir, "stats.txt"), "w") as stats_of:
        for j in range(nb_dims):
            label_focused = le.inverse_transform([j])[0]
            # Sorting the negated column gives the indices of the largest
            # values first.
            indices = (-matrix_preds[:, j]).argsort()[:n]

            print(f"LABEL: {label_focused}", file=stats_of)
            print(f"INDICE: {j}", file=stats_of)
            print("indices", file=stats_of)
            print(indices, file=stats_of)
            print("Best values", file=stats_of)
            print(matrix_preds[indices, j], file=stats_of)
            print("All dimensions of best values", file=stats_of)
            print(matrix_preds[indices], file=stats_of)

            # Use it to build a plot.
            pred_ = matrix_preds[indices]
            stats_mean = np.mean(pred_, axis=0)
            stats_std = np.std(pred_, axis=0)

            # Same color/linestyle cycle as pred_distribution, so that the
            # curves stay distinguishable with many classes.
            plt.gca().set_prop_cycle(custom_cycler)
            for i in range(nb_dims):
                label_str = le.inverse_transform([i])[0]
                mu = stats_mean[i]
                sigma = stats_std[i]
                variance = sigma * sigma

                print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")

                # NOTE(review): with a very small n, sigma can be 0 and the
                # normal CDF is then presumably not defined - to confirm.
                P = scipy.stats.norm.cdf(x, mu, sigma)
                plt.plot(x, P, label=label_str, **kwargs)

            plt.legend()
            plt.savefig(os.path.join(outdir, f"{label_focused}_prediction_cdf.pdf"))
            # Start from an empty figure for the next dimension.
            plt.clf()
112 | 161 | ||
113 | 162 | ||
def utt2dur(utt2dur: str, labels: str):
    '''
    Print the mean and the standard deviation of the utterance durations.

    Each non-empty line of the file must contain at least two fields
    separated by whitespace; the second field is the duration and is parsed
    as a float. The shape of the duration vector, the mean and the standard
    deviation are printed on the standard output.

    :param utt2dur: path of the utt2dur file.
    :param labels: path of a label file or None. It is not used for now.
    '''
    durations = []
    with open(utt2dur, "r") as f:
        for line in f:
            # split() without argument accepts several spaces or tabs
            # between the fields and removes the end of line.
            fields = line.split()
            if not fields:
                # Empty line (for example at the end of the file).
                continue
            durations.append(float(fields[1]))

    durations = np.asarray(durations, dtype=float)
    print(durations.shape)
    mean = np.mean(durations)
    std = np.std(durations)

    print(f"mean: {mean}")
    print(f"std: {std}")
133 | 182 | ||
134 | 183 | ||
135 | if __name__ == "__main__": | 184 | if __name__ == "__main__": |
136 | 185 | ||
137 | # Parser | 186 | # Parser |
138 | parser = argparse.ArgumentParser(description="Statistics") | 187 | parser = argparse.ArgumentParser(description="Statistics") |
139 | subparsers = parser.add_subparsers(title="actions") | 188 | subparsers = parser.add_subparsers(title="actions") |
140 | 189 | ||
141 | # pred-distribution | 190 | # pred-distribution |
142 | parser_pred_dist = subparsers.add_parser("pred-distribution", help="plot distributions of prediction through labels") | 191 | parser_pred_dist = subparsers.add_parser("pred-distribution", help="plot distributions of prediction through labels") |
143 | parser_pred_dist.add_argument("--predictions", type=str, help="prediction file", required=True) | 192 | parser_pred_dist.add_argument("--predictions", type=str, help="prediction file", required=True) |
144 | parser_pred_dist.add_argument("--labels", type=str, help="label file", required=True) | 193 | parser_pred_dist.add_argument("--labels", type=str, help="label file", required=True) |
145 | parser_pred_dist.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True) | 194 | parser_pred_dist.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True) |
146 | parser_pred_dist.add_argument("--outdir", type=str, help="output file", required=True) | 195 | parser_pred_dist.add_argument("--outdir", type=str, help="output file", required=True) |
147 | parser_pred_dist.set_defaults(which="pred_distribution") | 196 | parser_pred_dist.set_defaults(which="pred_distribution") |
148 | 197 | ||
149 | # pred-distribution-with-selection | 198 | # pred-distribution-with-selection |
150 | parser_pred_dist_wt_sel = subparsers.add_parser("pred-distribution-with-selection", help="plot distributions of prediction through labels with a selection of the n best records by column/class prediction.") | 199 | parser_pred_dist_wt_sel = subparsers.add_parser("pred-distribution-with-selection", help="plot distributions of prediction through labels with a selection of the n best records by column/class prediction.") |
151 | parser_pred_dist_wt_sel.add_argument("--predictions", type=str, help="prediction file", required=True) | 200 | parser_pred_dist_wt_sel.add_argument("--predictions", type=str, help="prediction file", required=True) |
201 | parser_pred_dist_wt_sel.add_argument("-n", type=int, help="Number of maximum selected for each prediction y_i.") | ||
152 | parser_pred_dist_wt_sel.add_argument("--labels", type=str, help="label file", required=True) | 202 | parser_pred_dist_wt_sel.add_argument("--labels", type=str, help="label file", required=True) |
153 | parser_pred_dist_wt_sel.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True) | 203 | parser_pred_dist_wt_sel.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True) |
154 | parser_pred_dist_wt_sel.add_argument("--outdir", type=str, help="output file", required=True) | 204 | parser_pred_dist_wt_sel.add_argument("--outdir", type=str, help="output file", required=True) |
155 | parser_pred_dist_wt_sel.set_defaults(which="pred_distribution_with_selection") | 205 | parser_pred_dist_wt_sel.set_defaults(which="pred_distribution_with_selection") |
206 | |||
156 | # duration-stats | 207 | # duration-stats |
157 | parser_utt2dur = subparsers.add_parser("utt2dur", help="distribution of utt2dur") | 208 | parser_utt2dur = subparsers.add_parser("utt2dur", help="distribution of utt2dur") |
158 | parser_utt2dur.add_argument("--utt2dur", type=str, help="utt2dur file", required=True) | 209 | parser_utt2dur.add_argument("--utt2dur", type=str, help="utt2dur file", required=True) |
159 | parser_utt2dur.add_argument("--labels", type=str, default=None, help="labels file") | 210 | parser_utt2dur.add_argument("--labels", type=str, default=None, help="labels file") |
160 | parser_utt2dur.set_defaults(which="utt2dur") | 211 | parser_utt2dur.set_defaults(which="utt2dur") |
161 | 212 | ||
162 | # Parse | 213 | # Parse |
163 | args = parser.parse_args() | 214 | args = parser.parse_args() |
164 | 215 | ||
165 | # Run commands | 216 | # Run commands |