Commit 765b51bc7741c15001b3983ef6df4d68eedbcd62
1 parent
d27fe6fcc5
Exists in
master
Little modification to synchronize
Showing 2 changed files with 51 additions and 2 deletions Inline Diff
volia/core/data.py
1 | ''' | 1 | ''' |
2 | Data management input/output | 2 | Data management input/output |
3 | ''' | 3 | ''' |
4 | 4 | ||
5 | # Import packages and modules | 5 | # Import packages and modules |
6 | import numpy as np | 6 | import numpy as np |
7 | import sys | 7 | import sys |
8 | 8 | ||
9 | # Defining some types | 9 | # Defining some types |
10 | from typing import List, Dict | 10 | from typing import List, Dict, Tuple |
11 | |||
12 | from numpy.lib.shape_base import expand_dims | ||
11 | KeyToList = Dict[str, List[str]] | 13 | KeyToList = Dict[str, List[str]] |
12 | KeyToLabels = Dict[str, List[str]] | 14 | KeyToLabels = Dict[str, List[str]] |
13 | KeyToIntLabels = Dict[str, List[int]] | 15 | KeyToIntLabels = Dict[str, List[int]] |
14 | KeyToFeatures = Dict[str, List[float]] | 16 | KeyToFeatures = Dict[str, List[float]] |
15 | 17 | ||
16 | 18 | ||
def read_lst(file_path: str) -> List[str]:
    '''
    Read lst file with this structure:
    [id_1]
    [id_2]
    ...
    [id_n]

    Args:
        file_path (str): path of the lst file.

    Returns:
        List[str]: the ids, one per line of the file, in file order.
    '''
    with open(file_path, "r") as f:
        # Only the line terminator is removed; any other whitespace on the
        # line is kept as part of the id.
        return [line.replace("\n", "") for line in f]
32 | 34 | ||
33 | 35 | ||
def read_id_values(file_path: str, value_type=str):
    '''
    Read file where each line is an id with its corresponding values:
    [id_1] [value_1_1] [value_1_2] ... [value_1_k]
    [id_2] [value_2_1] [value_2_2] ... [value_2_k]
    ...
    [id_n] [value_n_1] [value_n_2] ... [value_n_k]

    where values are value_type type.

    Fields are separated by any run of whitespace and blank lines are
    ignored, so a trailing empty line or a doubled space does not produce
    an empty id or an empty value.

    Used in many reader functions with specific value_type.

    Args:
        file_path (str): path of the file to read.
        value_type: dtype handed to np.asarray for the values. Defaults to str.

    Returns:
        dict: id (str) -> numpy array of the values found on that line.
        If an id appears several times, the last line wins.
    '''
    id_values = {}
    with open(file_path, "r") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line: nothing to record.
                continue
            id_values[fields[0]] = np.asarray(fields[1:], dtype=value_type)
    return id_values
53 | 55 | ||
54 | 56 | ||
def read_features(file_path: str) -> KeyToFeatures:
    '''
    Read a features file laid out as one utterance per line:
    [id_1] [value_1_1] [value_1_2] ... [value_1_k]
    [id_2] [value_2_1] [value_2_2] ... [value_2_k]
    ...
    [id_n] [value_n_1] [value_n_2] ... [value_n_k]

    where values are float

    Thin wrapper around read_id_values with a float64 value type.
    Returns a dictionary with id as key and the float values of the line
    as associated values.
    '''
    features = read_id_values(file_path, value_type=np.float64)
    return features
68 | 70 | ||
71 | |||
def read_features_with_matrix(file_path: str) -> Tuple[List[str], np.ndarray]:
    """Read a features file and returns the keys (utterances ids)
    with the corresponding matrix of values.

    Row i of the matrix holds the values of keys[i], so the matrix always
    has exactly len(keys) rows.

    Args:
        file_path (str): path of the features file

    Returns:
        [Tuple(List[str], np.ndarray)]: a tuple with a list of keys and the
        2-D float64 matrix. For a file without any entry the matrix is an
        empty array of shape (0, 0).

    Raises:
        ValueError: if the entries do not all have the same number of values.
    """
    data = read_id_values(file_path, np.float64)
    keys = list(data)
    if not keys:
        # np.stack refuses an empty sequence; still hand back a 2-D array so
        # callers can rely on .shape.
        return (keys, np.empty((0, 0), dtype=np.float64))

    # Stack every row once, in key order, so rows and keys stay aligned.
    matrix = np.stack([data[key] for key in keys], axis=0)
    return (keys, matrix)
69 | 92 | ||
def read_labels(file_path: str) -> KeyToLabels:
    '''
    Read a labels file laid out as one id per line:
    [id_1] [value_1_1] [value_1_2] ... [value_1_k]
    [id_2] [value_2_1] [value_2_2] ... [value_2_k]
    ...
    [id_n] [value_n_1] [value_n_2] ... [value_n_k]

    where values are kept as strings

    Thin wrapper around read_id_values with a str value type.
    Returns a dictionary with id as key and the labels of the line as
    associated values.
    '''
    labels = read_id_values(file_path, value_type=str)
    return labels
81 | 104 | ||
82 | 105 | ||
def read_labels_integer(file_path: str) -> KeyToIntLabels:
    '''
    Read a labels file laid out as one id per line:
    [id_1] [value_1_1] [value_1_2] ... [value_1_k]
    [id_2] [value_2_1] [value_2_2] ... [value_2_k]
    ...
    [id_n] [value_n_1] [value_n_2] ... [value_n_k]

    where values are int

    Thin wrapper around read_id_values with an int value type.
    Returns a dictionary with id as key and the integer labels of the line
    as associated values.
    '''
    int_labels = read_id_values(file_path, value_type=int)
    return int_labels
94 | 117 | ||
95 | 118 | ||
def write_line(id_, values=(), out=sys.stdout):
    """
    Write a line in list, labels or features files.
    If you want to write a list, specify an empty
    sequence for *values* (or leave it out).

    Each value is converted with str() before being written, so numeric
    features or integer labels can be passed directly.

    Args:
        id_ (str): id in string.
        values (sequence, optional): values to write, features or labels.
            Anything supporting len() and iteration works. Defaults to ().
        out (_io.TextIOWrapper, optional): writable text stream. Defaults to
            sys.stdout (the default is bound when the function is defined).
    """
    # len() rather than truthiness: truth-testing a numpy array with more
    # than one element raises.
    if len(values) == 0:
        out.write(str(id_) + "\n")
    else:
        out.write(str(id_) + " " + " ".join(str(value) for value in values) + "\n")
volia/stats.py
1 | 1 | ||
2 | import argparse | 2 | import argparse |
3 | 3 | ||
4 | import os | 4 | import os |
5 | import core.data | 5 | import core.data |
6 | import math | 6 | import math |
7 | import numpy as np | 7 | import numpy as np |
8 | import scipy.stats | 8 | import scipy.stats |
9 | import pickle | 9 | import pickle |
10 | import matplotlib.pyplot as plt | 10 | import matplotlib.pyplot as plt |
11 | import matplotlib.colors as mcolors | 11 | import matplotlib.colors as mcolors |
12 | from utils import SubCommandRunner | 12 | from utils import SubCommandRunner |
13 | 13 | ||
14 | 14 | ||
15 | from cycler import cycler | 15 | from cycler import cycler |
16 | 16 | ||
def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir: str):
    """Plot, per reference label, the normal CDF fitted on each prediction column.

    Utterances are grouped by their first label. For every group, the mean
    and standard deviation of each prediction column are computed, the CDF
    of the corresponding normal distribution is drawn on [0, 1], and one
    figure per label is saved as "<label>_prediction_cdf.pdf" in *outdir*.

    Args:
        predictions (str): path of the prediction file (id followed by float values).
        labels (str): path of the labels file (id followed by its labels).
        labelencoder (str): path of a pickled object exposing
            inverse_transform (presumably a scikit-learn LabelEncoder -
            TODO confirm).
        outdir (str): directory where the figures are written.

    Raises:
        KeyError: if a predicted id has no entry in the labels file.
    """
    # Read everything from the function arguments, never from the script-level
    # "args" namespace, so the function also works when imported and called.
    preds_by_id = core.data.read_id_values(predictions, float)
    labels_by_id = core.data.read_labels(labels)

    # NOTE(security): pickle.load can execute arbitrary code; only load
    # label-encoder files coming from a trusted source.
    with open(labelencoder, "rb") as f:
        le = pickle.load(f)

    print("PREDICTIONS ---------------------------")
    rows_by_label = {}
    for id_, scores in preds_by_id.items():
        label = labels_by_id[id_][0]
        rows_by_label.setdefault(label, []).append(scores)

    print("CALCULATING ---------------------------")

    custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
                     cycler(linestyle=['-', '--', '-.']))
    plot_kwargs = dict(alpha=0.5)
    # The evaluation grid is the same for every curve.
    x = np.linspace(0, 1, 1000)

    for label, rows in rows_by_label.items():
        # One row per utterance carrying this label.
        label_preds = np.stack(rows, axis=0)

        plt.gca().set_prop_cycle(custom_cycler)
        stats_mean = np.mean(label_preds, axis=0)
        stats_std = np.std(label_preds, axis=0)

        for i in range(label_preds.shape[1]):
            # Column i is named through the label encoder.
            label_str = le.inverse_transform([i])[0]
            mu = stats_mean[i]
            sigma = stats_std[i]
            variance = sigma * sigma
            print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")

            P = scipy.stats.norm.cdf(x, mu, sigma)
            plt.plot(x, P, label=label_str, **plot_kwargs)

        plt.legend()
        plt.savefig(os.path.join(outdir, f"{label}_prediction_cdf.pdf"))
        # Start from an empty figure for the next label.
        plt.clf()

    print("Decisions")
94 | 94 | ||
95 | 95 | ||
def pred_distribution_wt_sel(predictions: str, labels: str, labelencoder: str, outdir: str):
    """Print, for every prediction column, the n rows with the highest score.

    Work in progress: only *predictions* is read for now; *labels*,
    *labelencoder* and *outdir* are accepted but not used yet, and nothing
    is plotted or written.

    Args:
        predictions (str): path of the prediction file.
        labels (str): path of the labels file (unused).
        labelencoder (str): path of the label encoder pickle file (unused).
        outdir (str): output directory (unused).
    """
    _keys, matrix_preds = core.data.read_features_with_matrix(predictions)
    n = 3
    print(matrix_preds.shape)

    n_columns = matrix_preds.shape[1]
    for j in range(n_columns):
        column = matrix_preds[:, j]
        # Sorting the negated column ascending ranks rows by descending
        # score; keep the n first ones.
        indices = np.argsort(-column)[:n]
        print(f"INDICE: {j}")
        print("indices")
        print(indices)
        print("Best values")
        print(column[indices])
        print("All dimensions of best values")
        print(matrix_preds[indices])
        # Select the n best for each column
112 | |||
113 | |||
def utt2dur(utt2dur: str, labels: str):
    """Print the number, mean and standard deviation of utterance durations.

    The file is expected to hold one "[id] [duration]" pair per line.
    Fields are separated by any run of whitespace and blank lines are
    ignored.

    Args:
        utt2dur (str): path of the utt2dur file.
        labels (str): path of a labels file, or None. Currently unused.

    Raises:
        IndexError: if a non-blank line has no duration field.
        ValueError: if a duration cannot be parsed as a float.
    """
    durations = []
    with open(utt2dur, "r") as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Blank line (e.g. trailing newline at end of file).
                continue
            durations.append(float(fields[1]))

    durations = np.asarray(durations, dtype=float)
    print(durations.shape)
    mean = np.mean(durations)
    std = np.std(durations)

    print(f"mean: {mean}")
    print(f"std: {std}")
115 | 133 | ||
116 | 134 | ||
if __name__ == "__main__":

    # Parser
    parser = argparse.ArgumentParser(description="Statistics")
    subparsers = parser.add_subparsers(title="actions")

    # pred-distribution
    parser_pred_dist = subparsers.add_parser("pred-distribution", help="plot distributions of prediction through labels")
    parser_pred_dist.add_argument("--predictions", type=str, help="prediction file", required=True)
    parser_pred_dist.add_argument("--labels", type=str, help="label file", required=True)
    parser_pred_dist.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
    parser_pred_dist.add_argument("--outdir", type=str, help="output file", required=True)
    parser_pred_dist.set_defaults(which="pred_distribution")

    # pred-distribution-with-selection
    parser_pred_dist_wt_sel = subparsers.add_parser("pred-distribution-with-selection", help="plot distributions of prediction through labels with a selection of the n best records by column/class prediction.")
    parser_pred_dist_wt_sel.add_argument("--predictions", type=str, help="prediction file", required=True)
    parser_pred_dist_wt_sel.add_argument("--labels", type=str, help="label file", required=True)
    parser_pred_dist_wt_sel.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
    parser_pred_dist_wt_sel.add_argument("--outdir", type=str, help="output file", required=True)
    parser_pred_dist_wt_sel.set_defaults(which="pred_distribution_with_selection")

    # duration-stats
    parser_utt2dur = subparsers.add_parser("utt2dur", help="distribution of utt2dur")
    parser_utt2dur.add_argument("--utt2dur", type=str, help="utt2dur file", required=True)
    parser_utt2dur.add_argument("--labels", type=str, default=None, help="labels file")
    parser_utt2dur.set_defaults(which="utt2dur")

    # Parse
    args = parser.parse_args()

    # Sub-commands are optional for argparse: without one, "which" is never
    # set. Report a usage error instead of failing later on args.which.
    if not hasattr(args, "which"):
        parser.error("an action is required")

    # Run commands
    # The keys must match the "which" default set on each sub-parser.
    runner = SubCommandRunner({
        "pred_distribution": pred_distribution,
        "pred_distribution_with_selection": pred_distribution_wt_sel,
        "utt2dur": utt2dur
    })

    # "which" only selects the command; it is not passed on as an argument.
    runner.run(args.which, args.__dict__, remove="which")