Adding n argument to pred_distribution_wt_sel

Adding some comments

Adding n argument to pred_distribution_wt_sel
Adding some comments
quillotm
1 parent 78b39d22dd
Showing 1 changed file with 71 additions and 20 deletions Side-by-side Diff
volia/stats.py
@@ -10,12 +10,18 @@
 import matplotlib.pyplot as plt
 import matplotlib.colors as mcolors
 from utils import SubCommandRunner
-
-
 from cycler import cycler
  
+
 def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir: str):
+    '''
+    Distribution of the prediction.
  
+    For each label, we plot the distribution of the class predicted.
+    For example, for each character, we plot the distribution of the characters predicted.
+    Another example, for each speaker, we plot the distribution of the characters predicted.
+
+    '''
     predictions = core.data.read_id_values(args.predictions, float)
     labels = core.data.read_labels(args.labels)
  
  
@@ -35,11 +41,7 @@
         else:
             stats[label]["nb_utt"] = stats[label]["nb_utt"] + 1
             stats[label]["predictions"] = np.append(stats[label]["predictions"], np.expand_dims(predictions_, axis=0), axis=0)
-    
  
-    print("CALCULATING ---------------------------")
-    
-
     colors = [
         "darkorange",
         "red",
  
@@ -48,13 +50,14 @@
     custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
         cycler(linestyle=['-', '--', '-.']))
  
+    print("CALCULATING ---------------------------")
  
     for label, stats_ in stats.items():
  
         plt.gca().set_prop_cycle(custom_cycler)
         stats_mean = np.mean(stats_["predictions"], axis=0)
         stats_std = np.std(stats_["predictions"], axis=0)
-        
+
         #print(label)
         #print(stats_mean)
         #print(stats_std)
@@ -66,7 +69,6 @@
             mu = stats_mean[i]
             variance = stats_std[i] * stats_std[i]
             sigma = stats_std[i]
-            # math.sqrt(variance)
             print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")
  
             #x_values = np.arange(-1, 5, 0.1)
@@ -75,7 +77,7 @@
             #y = scipy.stats.norm.pdf(x,mean,std)
  
             #plt.plot(x_values, y_values.pdf(x_values,))
-            
+
             #x, step = np.linspace(mu - 3*sigma, mu + 3*sigma, 1000, retstep=True)
             x = np.linspace(0, 1, 1000)
             #x = np.linspace(mu - 3*sigma, mu + 3*sigma, 1000)
  
  
  
  
@@ -93,21 +95,68 @@
     print("Decisions")
  
  
-def pred_distribution_wt_sel(predictions: str, labels: str, labelencoder: str, outdir: str):
+def pred_distribution_wt_sel(predictions: str, n: int, labels: str, labelencoder: str, outdir: str):
  
+    '''
+    Distribution of the predictions with selection process.
+
+    1) For each dimension, select the n individus with the maximum values for the focused dimension.
+    We name S_i the set of n selected individus for the dimension i.
+    2) For each subset S_i, we plot the distribution of each dimension.
+    '''
+
+    le = None
+    with open(args.labelencoder, "rb") as f:
+        le = pickle.load(f)
+
     keys_preds, matrix_preds = core.data.read_features_with_matrix(predictions)
-    n = 3
-    print(matrix_preds.shape)
+
+    colors = [
+        "darkorange",
+        "red",
+        "blue"
+    ]
+    custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
+        cycler(linestyle=['-', '--', '-.']))
+
+    kwargs = dict(alpha=0.5)
+
+    stats_of = open(os.path.join(args.outdir, f"stats.txt"), "w")
     for j in range(matrix_preds.shape[1]):
+
+        label_focused = le.inverse_transform([j])[0]
         indices = (-matrix_preds[:, j]).argsort()[:n]
-        print(f"INDICE: {j}")
-        print("indices")
-        print(indices)
-        print("Best values")
-        print(matrix_preds[indices, j])
-        print("All dimensions of best values")
-        print(matrix_preds[indices])
-    # Select the n best for each column
+
+        print(f"LABEL: {label_focused}", file=stats_of)
+        print(f"INDICE: {j}", file=stats_of)
+        print("indices", file=stats_of)
+        print(indices, file=stats_of)
+        print("Best values", file=stats_of)
+        print(matrix_preds[indices, j], file=stats_of)
+        print("All dimensions of best values", file=stats_of)
+        print(matrix_preds[indices], file=stats_of)
+
+        # Use it to build a plot.
+        pred_ = matrix_preds[indices]
+        stats_mean = np.mean(pred_, axis=0)
+        stats_std = np.std(pred_, axis=0)
+        for i in range(matrix_preds.shape[1]):
+            label_str = le.inverse_transform([i])[0]
+            mu = stats_mean[i]
+            variance = stats_std[i] * stats_std[i]
+            sigma = stats_std[i]
+
+            print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")
+
+            x = np.linspace(0, 1, 1000)
+
+            P = scipy.stats.norm.cdf(x, mu, sigma)
+            plt.plot(x, P, label=label_str, **kwargs)
+
+        plt.legend()
+        plt.savefig(os.path.join(args.outdir, f"{label_focused}_prediction_cdf.pdf"))
+        plt.clf()
+    stats_of.close()
     pass
  
  
  
@@ -149,10 +198,12 @@
     # pred-distribution-with-selection
     parser_pred_dist_wt_sel = subparsers.add_parser("pred-distribution-with-selection", help="plot distributions of prediction through labels with a selection of the n best records by column/class prediction.")
     parser_pred_dist_wt_sel.add_argument("--predictions", type=str, help="prediction file", required=True)
+    parser_pred_dist_wt_sel.add_argument("-n", type=int, help="Number of maximum selected for each prediction y_i.")
     parser_pred_dist_wt_sel.add_argument("--labels", type=str, help="label file", required=True)
     parser_pred_dist_wt_sel.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
     parser_pred_dist_wt_sel.add_argument("--outdir", type=str, help="output file", required=True)
     parser_pred_dist_wt_sel.set_defaults(which="pred_distribution_with_selection")
+
     # duration-stats
     parser_utt2dur = subparsers.add_parser("utt2dur", help="distribution of utt2dur")
     parser_utt2dur.add_argument("--utt2dur", type=str, help="utt2dur file", required=True)
...	...	@@ -10,12 +10,18 @@
10	10	import matplotlib.pyplot as plt
11	11	import matplotlib.colors as mcolors
12	12	from utils import SubCommandRunner
13		-
14		-
15	13	from cycler import cycler
16	14
	15	+
17	16	def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir: str):
	17	+ '''
	18	+ Distribution of the prediction.
18	19
	20	+ For each label, we plot the distribution of the class predicted.
	21	+ For example, for each character, we plot the distribution of the characters predicted.
	22	+ Another example, for each speaker, we plot the distribution of the characters predicted.
	23	+
	24	+ '''
19	25	predictions = core.data.read_id_values(args.predictions, float)
20	26	labels = core.data.read_labels(args.labels)
21	27
22	28
...	...	@@ -35,11 +41,7 @@
35	41	else:
36	42	stats[label]["nb_utt"] = stats[label]["nb_utt"] + 1
37	43	stats[label]["predictions"] = np.append(stats[label]["predictions"], np.expand_dims(predictions_, axis=0), axis=0)
38		-
39	44
40		- print("CALCULATING ---------------------------")
41		-
42		-
43	45	colors = [
44	46	"darkorange",
45	47	"red",
46	48
...	...	@@ -48,13 +50,14 @@
48	50	custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
49	51	cycler(linestyle=['-', '--', '-.']))
50	52
	53	+ print("CALCULATING ---------------------------")
51	54
52	55	for label, stats_ in stats.items():
53	56
54	57	plt.gca().set_prop_cycle(custom_cycler)
55	58	stats_mean = np.mean(stats_["predictions"], axis=0)
56	59	stats_std = np.std(stats_["predictions"], axis=0)
57		-
	60	+
58	61	#print(label)
59	62	#print(stats_mean)
60	63	#print(stats_std)
...	...	@@ -66,7 +69,6 @@
66	69	mu = stats_mean[i]
67	70	variance = stats_std[i] * stats_std[i]
68	71	sigma = stats_std[i]
69		- # math.sqrt(variance)
70	72	print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")
71	73
72	74	#x_values = np.arange(-1, 5, 0.1)
...	...	@@ -75,7 +77,7 @@
75	77	#y = scipy.stats.norm.pdf(x,mean,std)
76	78
77	79	#plt.plot(x_values, y_values.pdf(x_values,))
78		-
	80	+
79	81	#x, step = np.linspace(mu - 3*sigma, mu + 3*sigma, 1000, retstep=True)
80	82	x = np.linspace(0, 1, 1000)
81	83	#x = np.linspace(mu - 3*sigma, mu + 3*sigma, 1000)
82	84
83	85
84	86
85	87
...	...	@@ -93,21 +95,68 @@
93	95	print("Decisions")
94	96
95	97
96		-def pred_distribution_wt_sel(predictions: str, labels: str, labelencoder: str, outdir: str):
	98	+def pred_distribution_wt_sel(predictions: str, n: int, labels: str, labelencoder: str, outdir: str):
97	99
	100	+ '''
	101	+ Distribution of the predictions with selection process.
	102	+
	103	+ 1) For each dimension, select the n individus with the maximum values for the focused dimension.
	104	+ We name S_i the set of n selected individus for the dimension i.
	105	+ 2) For each subset S_i, we plot the distribution of each dimension.
	106	+ '''
	107	+
	108	+ le = None
	109	+ with open(args.labelencoder, "rb") as f:
	110	+ le = pickle.load(f)
	111	+
98	112	keys_preds, matrix_preds = core.data.read_features_with_matrix(predictions)
99		- n = 3
100		- print(matrix_preds.shape)
	113	+
	114	+ colors = [
	115	+ "darkorange",
	116	+ "red",
	117	+ "blue"
	118	+ ]
	119	+ custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
	120	+ cycler(linestyle=['-', '--', '-.']))
	121	+
	122	+ kwargs = dict(alpha=0.5)
	123	+
	124	+ stats_of = open(os.path.join(args.outdir, f"stats.txt"), "w")
101	125	for j in range(matrix_preds.shape[1]):
	126	+
	127	+ label_focused = le.inverse_transform([j])[0]
102	128	indices = (-matrix_preds[:, j]).argsort()[:n]
103		- print(f"INDICE: {j}")
104		- print("indices")
105		- print(indices)
106		- print("Best values")
107		- print(matrix_preds[indices, j])
108		- print("All dimensions of best values")
109		- print(matrix_preds[indices])
110		- # Select the n best for each column
	129	+
	130	+ print(f"LABEL: {label_focused}", file=stats_of)
	131	+ print(f"INDICE: {j}", file=stats_of)
	132	+ print("indices", file=stats_of)
	133	+ print(indices, file=stats_of)
	134	+ print("Best values", file=stats_of)
	135	+ print(matrix_preds[indices, j], file=stats_of)
	136	+ print("All dimensions of best values", file=stats_of)
	137	+ print(matrix_preds[indices], file=stats_of)
	138	+
	139	+ # Use it to build a plot.
	140	+ pred_ = matrix_preds[indices]
	141	+ stats_mean = np.mean(pred_, axis=0)
	142	+ stats_std = np.std(pred_, axis=0)
	143	+ for i in range(matrix_preds.shape[1]):
	144	+ label_str = le.inverse_transform([i])[0]
	145	+ mu = stats_mean[i]
	146	+ variance = stats_std[i] * stats_std[i]
	147	+ sigma = stats_std[i]
	148	+
	149	+ print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")
	150	+
	151	+ x = np.linspace(0, 1, 1000)
	152	+
	153	+ P = scipy.stats.norm.cdf(x, mu, sigma)
	154	+ plt.plot(x, P, label=label_str, **kwargs)
	155	+
	156	+ plt.legend()
	157	+ plt.savefig(os.path.join(args.outdir, f"{label_focused}_prediction_cdf.pdf"))
	158	+ plt.clf()
	159	+ stats_of.close()
111	160	pass
112	161
113	162
114	163
...	...	@@ -149,10 +198,12 @@
149	198	# pred-distribution-with-selection
150	199	parser_pred_dist_wt_sel = subparsers.add_parser("pred-distribution-with-selection", help="plot distributions of prediction through labels with a selection of the n best records by column/class prediction.")
151	200	parser_pred_dist_wt_sel.add_argument("--predictions", type=str, help="prediction file", required=True)
	201	+ parser_pred_dist_wt_sel.add_argument("-n", type=int, help="Number of maximum selected for each prediction y_i.")
152	202	parser_pred_dist_wt_sel.add_argument("--labels", type=str, help="label file", required=True)
153	203	parser_pred_dist_wt_sel.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
154	204	parser_pred_dist_wt_sel.add_argument("--outdir", type=str, help="output file", required=True)
155	205	parser_pred_dist_wt_sel.set_defaults(which="pred_distribution_with_selection")
	206	+
156	207	# duration-stats
157	208	parser_utt2dur = subparsers.add_parser("utt2dur", help="distribution of utt2dur")
158	209	parser_utt2dur.add_argument("--utt2dur", type=str, help="utt2dur file", required=True)