Commit 9bb5ff657bf803e1ce5a403f9998e700bf3a3f72

Authored by quillotm
1 parent 78b39d22dd
Exists in master

Adding n argument to pred_distribution_wt_sel

Adding some comments

Showing 1 changed file with 71 additions and 20 deletions Side-by-side Diff

... ... @@ -10,12 +10,18 @@
10 10 import matplotlib.pyplot as plt
11 11 import matplotlib.colors as mcolors
12 12 from utils import SubCommandRunner
13   -
14   -
15 13 from cycler import cycler
16 14  
  15 +
17 16 def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir: str):
  17 + '''
  18 + Distribution of the prediction.
18 19  
  20 + For each label, we plot the distribution of the predicted classes.
  21 + For example, for each character, we plot the distribution of the predicted characters.
  22 + As another example, for each speaker, we plot the distribution of the predicted characters.
  23 +
  24 + '''
19 25 predictions = core.data.read_id_values(args.predictions, float)
20 26 labels = core.data.read_labels(args.labels)
21 27  
22 28  
... ... @@ -35,11 +41,7 @@
35 41 else:
36 42 stats[label]["nb_utt"] = stats[label]["nb_utt"] + 1
37 43 stats[label]["predictions"] = np.append(stats[label]["predictions"], np.expand_dims(predictions_, axis=0), axis=0)
38   -
39 44  
40   - print("CALCULATING ---------------------------")
41   -
42   -
43 45 colors = [
44 46 "darkorange",
45 47 "red",
46 48  
... ... @@ -48,13 +50,14 @@
48 50 custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
49 51 cycler(linestyle=['-', '--', '-.']))
50 52  
  53 + print("CALCULATING ---------------------------")
51 54  
52 55 for label, stats_ in stats.items():
53 56  
54 57 plt.gca().set_prop_cycle(custom_cycler)
55 58 stats_mean = np.mean(stats_["predictions"], axis=0)
56 59 stats_std = np.std(stats_["predictions"], axis=0)
57   -
  60 +
58 61 #print(label)
59 62 #print(stats_mean)
60 63 #print(stats_std)
... ... @@ -66,7 +69,6 @@
66 69 mu = stats_mean[i]
67 70 variance = stats_std[i] * stats_std[i]
68 71 sigma = stats_std[i]
69   - # math.sqrt(variance)
70 72 print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")
71 73  
72 74 #x_values = np.arange(-1, 5, 0.1)
... ... @@ -75,7 +77,7 @@
75 77 #y = scipy.stats.norm.pdf(x,mean,std)
76 78  
77 79 #plt.plot(x_values, y_values.pdf(x_values,))
78   -
  80 +
79 81 #x, step = np.linspace(mu - 3*sigma, mu + 3*sigma, 1000, retstep=True)
80 82 x = np.linspace(0, 1, 1000)
81 83 #x = np.linspace(mu - 3*sigma, mu + 3*sigma, 1000)
82 84  
83 85  
84 86  
85 87  
... ... @@ -93,21 +95,68 @@
93 95 print("Decisions")
94 96  
95 97  
96   -def pred_distribution_wt_sel(predictions: str, labels: str, labelencoder: str, outdir: str):
  98 +def pred_distribution_wt_sel(predictions: str, n: int, labels: str, labelencoder: str, outdir: str):
97 99  
  100 + '''
  101 + Distribution of the predictions with selection process.
  102 +
  103 + 1) For each dimension, select the n individuals with the maximum values for the focused dimension.
  104 + We denote by S_i the set of the n individuals selected for dimension i.
  105 + 2) For each subset S_i, we plot the distribution of each dimension.
  106 + '''
  107 +
  108 + le = None
  109 + with open(args.labelencoder, "rb") as f:
  110 + le = pickle.load(f)
  111 +
98 112 keys_preds, matrix_preds = core.data.read_features_with_matrix(predictions)
99   - n = 3
100   - print(matrix_preds.shape)
  113 +
  114 + colors = [
  115 + "darkorange",
  116 + "red",
  117 + "blue"
  118 + ]
  119 + custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
  120 + cycler(linestyle=['-', '--', '-.']))
  121 +
  122 + kwargs = dict(alpha=0.5)
  123 +
  124 + stats_of = open(os.path.join(args.outdir, f"stats.txt"), "w")
101 125 for j in range(matrix_preds.shape[1]):
  126 +
  127 + label_focused = le.inverse_transform([j])[0]
102 128 indices = (-matrix_preds[:, j]).argsort()[:n]
103   - print(f"INDICE: {j}")
104   - print("indices")
105   - print(indices)
106   - print("Best values")
107   - print(matrix_preds[indices, j])
108   - print("All dimensions of best values")
109   - print(matrix_preds[indices])
110   - # Select the n best for each column
  129 +
  130 + print(f"LABEL: {label_focused}", file=stats_of)
  131 + print(f"INDICE: {j}", file=stats_of)
  132 + print("indices", file=stats_of)
  133 + print(indices, file=stats_of)
  134 + print("Best values", file=stats_of)
  135 + print(matrix_preds[indices, j], file=stats_of)
  136 + print("All dimensions of best values", file=stats_of)
  137 + print(matrix_preds[indices], file=stats_of)
  138 +
  139 + # Use it to build a plot.
  140 + pred_ = matrix_preds[indices]
  141 + stats_mean = np.mean(pred_, axis=0)
  142 + stats_std = np.std(pred_, axis=0)
  143 + for i in range(matrix_preds.shape[1]):
  144 + label_str = le.inverse_transform([i])[0]
  145 + mu = stats_mean[i]
  146 + variance = stats_std[i] * stats_std[i]
  147 + sigma = stats_std[i]
  148 +
  149 + print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")
  150 +
  151 + x = np.linspace(0, 1, 1000)
  152 +
  153 + P = scipy.stats.norm.cdf(x, mu, sigma)
  154 + plt.plot(x, P, label=label_str, **kwargs)
  155 +
  156 + plt.legend()
  157 + plt.savefig(os.path.join(args.outdir, f"{label_focused}_prediction_cdf.pdf"))
  158 + plt.clf()
  159 + stats_of.close()
111 160 pass
112 161  
113 162  
114 163  
... ... @@ -149,10 +198,12 @@
149 198 # pred-distribution-with-selection
150 199 parser_pred_dist_wt_sel = subparsers.add_parser("pred-distribution-with-selection", help="plot distributions of prediction through labels with a selection of the n best records by column/class prediction.")
151 200 parser_pred_dist_wt_sel.add_argument("--predictions", type=str, help="prediction file", required=True)
  201 + parser_pred_dist_wt_sel.add_argument("-n", type=int, help="Number of maximum selected for each prediction y_i.")
152 202 parser_pred_dist_wt_sel.add_argument("--labels", type=str, help="label file", required=True)
153 203 parser_pred_dist_wt_sel.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
154 204 parser_pred_dist_wt_sel.add_argument("--outdir", type=str, help="output file", required=True)
155 205 parser_pred_dist_wt_sel.set_defaults(which="pred_distribution_with_selection")
  206 +
156 207 # duration-stats
157 208 parser_utt2dur = subparsers.add_parser("utt2dur", help="distribution of utt2dur")
158 209 parser_utt2dur.add_argument("--utt2dur", type=str, help="utt2dur file", required=True)