Commit 765b51bc7741c15001b3983ef6df4d68eedbcd62

Authored by Quillot Mathias
1 parent d27fe6fcc5
Exists in master

Little modification to synchronize

Showing 2 changed files with 51 additions and 2 deletions

Changed file 1 of 2, the data management input/output module (added lines are marked with +, removed lines with -):

'''
Data management input/output
'''

# Import packages and modules
import numpy as np
import sys

# Defining some types
-from typing import List, Dict
+from typing import List, Dict, Tuple
+
+from numpy.lib.shape_base import expand_dims
KeyToList = Dict[str, List[str]]
KeyToLabels = Dict[str, List[str]]
KeyToIntLabels = Dict[str, List[int]]
KeyToFeatures = Dict[str, List[float]]

def read_lst(file_path: str) -> KeyToList:
    '''
    Read lst file with this structure:
    [id_1]
    [id_2]
    ...
    [id_n]

    Return a list of ids.
    '''
    lst = []
    with open(file_path, "r") as f:
        for line in f:
            lst.append(line.replace("\n", ""))
    return lst

def read_id_values(file_path: str, value_type=str):
    '''
    Read file where each line is an id with its corresponding values:
    [id_1] [value_1_1] [value_1_2] ... [value_1_k]
    [id_2] [value_2_1] [value_2_2] ... [value_2_k]
    ...
    [id_n] [value_n_1] [value_n_2] ... [value_n_k]

    where values are value_type type.

    Used in many reader functions with specific value_type.
    Return a dictionary with id as key and values as associated values.
    '''
    id_values = {}
    with open(file_path, "r") as f:
        for line in f:
            splited = line.replace("\n", "").split(" ")
            id_values[splited[0]] = np.asarray(splited[1:], dtype=value_type)
    return id_values

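A minimal usage sketch of read_id_values, assuming the module's numpy import and a hypothetical file scores.txt whose lines look like "utt1 0.1 0.9":

# Hypothetical example: read float values keyed by utterance id.
# scores.txt (assumed contents):
#   utt1 0.1 0.9
#   utt2 0.7 0.3
scores = read_id_values("scores.txt", value_type=np.float64)
# scores == {"utt1": array([0.1, 0.9]), "utt2": array([0.7, 0.3])}
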
def read_features(file_path: str) -> KeyToFeatures:
    '''
    Read features files with the following structure:
    [id_1] [value_1_1] [value_1_2] ... [value_1_k]
    [id_2] [value_2_1] [value_2_2] ... [value_2_k]
    ...
    [id_n] [value_n_1] [value_n_2] ... [value_n_k]

    where values are float

    Returns a dictionary with id as key and a list of values as associated values
    '''
    return read_id_values(file_path, np.float64)

+
+def read_features_with_matrix(file_path: str) -> Tuple[List[str], np.ndarray]:
+    """Read a features file and return the keys (utterance ids)
+    with the corresponding matrix of values.
+
+    Args:
+        file_path (str): path of the features file
+
+    Returns:
+        Tuple[List[str], np.ndarray]: a tuple with a list of keys and the matrix
+    """
+    data = read_id_values(file_path, np.float64)
+    keys = []
+    matrix = None
+    for key, values in data.items():
+        keys.append(key)
+        if matrix is None:
+            matrix = np.expand_dims(values, axis=0)
+        matrix = np.append(matrix, np.expand_dims(values, axis=0), axis=0)
+
+    return (keys, matrix)

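In the loop above, the first iteration both initializes matrix from the first row and then appends that same row again, so the returned matrix has len(keys) + 1 rows. A minimal alternative sketch, reusing the module's imports and assuming every id has the same number of values (the helper name read_features_with_matrix_alt is only illustrative):

def read_features_with_matrix_alt(file_path: str) -> Tuple[List[str], np.ndarray]:
    # Stack all rows in one call so each key contributes exactly one row.
    data = read_id_values(file_path, np.float64)
    keys = list(data.keys())
    matrix = np.vstack([data[key] for key in keys])
    return keys, matrix
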
def read_labels(file_path: str) -> KeyToLabels:
    '''
    Read labels files with the following structure:
    [id_1] [value_1_1] [value_1_2] ... [value_1_k]
    [id_2] [value_2_1] [value_2_2] ... [value_2_k]
    ...
    [id_n] [value_n_1] [value_n_2] ... [value_n_k]

    where values are strings
    '''
    return read_id_values(file_path, str)

def read_labels_integer(file_path: str) -> KeyToIntLabels:
    '''
    Read labels files with the following structure:
    [id_1] [value_1_1] [value_1_2] ... [value_1_k]
    [id_2] [value_2_1] [value_2_2] ... [value_2_k]
    ...
    [id_n] [value_n_1] [value_n_2] ... [value_n_k]

    where values are int
    '''
    return read_id_values(file_path, int)

def write_line(id_, values=[], out=sys.stdout):
    """
    Write a line to a list, labels or features file.
    If you want to write a list line, specify an empty
    array for *values*.

    Args:
        id_ (str): id in string.
        values (list, optional): list of values to write, features or labels. Defaults to [].
        out (_io.TextIOWrapper, optional): output stream. Defaults to sys.stdout.
    """
    if len(values) == 0:
        out.write(str(id_) + "\n")
    else:
        out.write(str(id_) + " " + " ".join(values) + "\n")
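
A minimal usage sketch of write_line with a hypothetical output file feats.txt; values must already be strings because they are joined with " ".join:

# Hypothetical example: write one features line and one list-style line (id only).
with open("feats.txt", "w") as out:
    write_line("utt1", values=["0.1", "0.9"], out=out)
    write_line("utt2", out=out)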

Changed file 2 of 2, the statistics command-line script:


import argparse

import os
import core.data
import math
import numpy as np
import scipy.stats
import pickle
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from utils import SubCommandRunner


from cycler import cycler

def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir: str):

    predictions = core.data.read_id_values(args.predictions, float)
    labels = core.data.read_labels(args.labels)

    le = None
    with open(args.labelencoder, "rb") as f:
        le = pickle.load(f)
    stats = {}

    print("PREDICTIONS ---------------------------")
    for id_, predictions_ in predictions.items():
        label = labels[id_][0]
        if label not in stats:
            stats[label] = {
                "nb_utt": 1,
                "predictions": np.expand_dims(predictions_, axis=0)
            }
        else:
            stats[label]["nb_utt"] = stats[label]["nb_utt"] + 1
            stats[label]["predictions"] = np.append(stats[label]["predictions"], np.expand_dims(predictions_, axis=0), axis=0)


    print("CALCULATING ---------------------------")


    colors = [
        "darkorange",
        "red",
        "blue"
    ]
    custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
                     cycler(linestyle=['-', '--', '-.']))


    for label, stats_ in stats.items():

        plt.gca().set_prop_cycle(custom_cycler)
        stats_mean = np.mean(stats_["predictions"], axis=0)
        stats_std = np.std(stats_["predictions"], axis=0)

        #print(label)
        #print(stats_mean)
        #print(stats_std)
        kwargs = dict(alpha=0.5)

        for i in range(stats_["predictions"].shape[1]):
            label_str = le.inverse_transform([i])[0]
            #plt.hist(stats_["predictions"][:, i], bins=10, label=label_str, **kwargs)
            mu = stats_mean[i]
            variance = stats_std[i] * stats_std[i]
            sigma = stats_std[i]
            # math.sqrt(variance)
            print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")

            #x_values = np.arange(-1, 5, 0.1)

            #y_values = scipy.stats.norm(mu, variance)
            #y = scipy.stats.norm.pdf(x,mean,std)

            #plt.plot(x_values, y_values.pdf(x_values,))

            #x, step = np.linspace(mu - 3*sigma, mu + 3*sigma, 1000, retstep=True)
            x = np.linspace(0, 1, 1000)
            #x = np.linspace(mu - 3*sigma, mu + 3*sigma, 1000)
            #x, step = np.linspace(0, 1, 1000, retstep=True)

            P = scipy.stats.norm.cdf(x, mu, sigma)
            #print(step)
            plt.plot(x, P, label=label_str, **kwargs)
            #plt.savefig("simple_gaussian.pdf")

        plt.legend()
        plt.savefig(os.path.join(args.outdir, f"{label}_prediction_cdf.pdf"))
        plt.clf()

    print("Decisions")

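Note that pred_distribution takes predictions, labels, labelencoder and outdir as parameters, but its body reads the module-level args (args.predictions, args.labels, args.labelencoder and args.outdir), so it only works after the __main__ block below has populated args. A minimal sketch, assuming the intent is to use the parameters, of how the first two lines would then look (the same applies to args.labelencoder and args.outdir):

    # Sketch of the parameter-based equivalent of the first two lines.
    predictions = core.data.read_id_values(predictions, float)
    labels = core.data.read_labels(labels)
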
+def pred_distribution_wt_sel(predictions: str, labels: str, labelencoder: str, outdir: str):
+
+    keys_preds, matrix_preds = core.data.read_features_with_matrix(predictions)
+    n = 3
+    print(matrix_preds.shape)
+    for j in range(matrix_preds.shape[1]):
+        indices = (-matrix_preds[:, j]).argsort()[:n]
+        print(f"INDICE: {j}")
+        print("indices")
+        print(indices)
+        print("Best values")
+        print(matrix_preds[indices, j])
+        print("All dimensions of best values")
+        print(matrix_preds[indices])
+        # Select the n best for each column
+        pass
+
+
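The argsort pattern used above keeps, for each column j, the indices of the n rows with the largest predictions. A small self-contained sketch on a made-up matrix:

import numpy as np

matrix_preds = np.array([[0.1, 0.9],
                         [0.8, 0.2],
                         [0.5, 0.4],
                         [0.3, 0.7]])
n = 2
for j in range(matrix_preds.shape[1]):
    # Rows with the n largest values in column j, best first.
    indices = (-matrix_preds[:, j]).argsort()[:n]
    print(j, indices, matrix_preds[indices, j])
# column 0 -> rows [1, 2] with values [0.8, 0.5]
# column 1 -> rows [0, 3] with values [0.9, 0.7]
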
def utt2dur(utt2dur: str, labels: str):
    if labels == None:
        pass
    else:
        pass

    durations = []
    with open(utt2dur, "r") as f:
        for line in f:
            splited = line.replace("\n", "").split(" ")
            durations.append(float(splited[1]))

    durations = np.asarray(durations, dtype=float)
    print(durations.shape)
    mean = np.mean(durations)
    std = np.std(durations)

    print(f"mean: {mean}")
    print(f"std: {std}")

if __name__ == "__main__":

    # Parser
    parser = argparse.ArgumentParser(description="Statistics")
    subparsers = parser.add_subparsers(title="actions")

    # pred-distribution
    parser_pred_dist = subparsers.add_parser("pred-distribution", help="plot distributions of prediction through labels")
    parser_pred_dist.add_argument("--predictions", type=str, help="prediction file", required=True)
    parser_pred_dist.add_argument("--labels", type=str, help="label file", required=True)
    parser_pred_dist.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
    parser_pred_dist.add_argument("--outdir", type=str, help="output file", required=True)
    parser_pred_dist.set_defaults(which="pred_distribution")

+    # pred-distribution-with-selection
+    parser_pred_dist_wt_sel = subparsers.add_parser("pred-distribution-with-selection", help="plot distributions of prediction through labels with a selection of the n best records by column/class prediction.")
+    parser_pred_dist_wt_sel.add_argument("--predictions", type=str, help="prediction file", required=True)
+    parser_pred_dist_wt_sel.add_argument("--labels", type=str, help="label file", required=True)
+    parser_pred_dist_wt_sel.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
+    parser_pred_dist_wt_sel.add_argument("--outdir", type=str, help="output file", required=True)
+    parser_pred_dist_wt_sel.set_defaults(which="pred_distribution_with_selection")
    # duration-stats
    parser_utt2dur = subparsers.add_parser("utt2dur", help="distribution of utt2dur")
    parser_utt2dur.add_argument("--utt2dur", type=str, help="utt2dur file", required=True)
    parser_utt2dur.add_argument("--labels", type=str, default=None, help="labels file")
    parser_utt2dur.set_defaults(which="utt2dur")

    # Parse
    args = parser.parse_args()

    # Run commands
    runner = SubCommandRunner({
-        "pred-distribution": pred_distribution,
+        "pred_distribution": pred_distribution,
+        "pred_distribution_with_selection": pred_distribution_wt_sel,
        "utt2dur": utt2dur
    })

    runner.run(args.which, args.__dict__, remove="which")
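
Assuming the script is invoked as, say, python stats.py (its filename is not shown in this diff) and hypothetical input paths, the new subcommand added by this commit would be run along these lines:

python stats.py pred-distribution-with-selection \
    --predictions predictions.txt \
    --labels labels.txt \
    --labelencoder labelencoder.pkl \
    --outdir out/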