Quillot Mathias / volia

1

2

import argparse

2

import argparse

3

4

import os

4

import os

5

import core.data

5

import core.data

6

import math

6

import math

7

import numpy as np

7

import numpy as np

8

import scipy.stats

8

import scipy.stats

9

import pickle

9

import pickle

10

import matplotlib.pyplot as plt

10

import matplotlib.pyplot as plt

11

import matplotlib.colors as mcolors

11

import matplotlib.colors as mcolors

12

from utils import SubCommandRunner

12

from utils import SubCommandRunner

13

14

15

from cycler import cycler

13

from cycler import cycler

16

14

15

17

def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir: str):
    '''
    Plot, for each reference label, the distribution of the predicted classes.

    Predictions are grouped by the first label of their id. For every group,
    the mean and standard deviation of each prediction dimension are computed
    and the CDF of the matching normal distribution is drawn over [0, 1].
    One figure per label is saved as "<label>_prediction_cdf.pdf" in outdir.

    For example, for each character, we plot the distribution of the
    characters predicted. Another example, for each speaker, we plot the
    distribution of the characters predicted.

    Args:
        predictions: path of the prediction file, read with
            core.data.read_id_values (values parsed as float).
        labels: path of the label file, read with core.data.read_labels.
        labelencoder: path of a pickled encoder exposing inverse_transform
            (presumably a scikit-learn LabelEncoder -- TODO confirm).
        outdir: directory in which the figures are written.
    '''
    # Use the function parameters rather than the module-level `args`, so the
    # function also works when it is called outside of the command line.
    pred_by_id = core.data.read_id_values(predictions, float)
    labels_by_id = core.data.read_labels(labels)

    # NOTE(review): pickle.load can execute arbitrary code; only load encoder
    # files coming from a trusted source.
    with open(labelencoder, "rb") as f:
        le = pickle.load(f)

    print("PREDICTIONS ---------------------------")
    # Collect the prediction vectors per label and stack them once per label
    # (appending to an array inside the loop copies it at every step).
    rows_by_label = {}
    for id_, prediction in pred_by_id.items():
        rows_by_label.setdefault(labels_by_id[id_][0], []).append(prediction)

    custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
                     cycler(linestyle=['-', '--', '-.']))
    kwargs = dict(alpha=0.5)
    # The abscissa is the same for every curve.
    x = np.linspace(0, 1, 1000)

    print("CALCULATING ---------------------------")

    for label, rows in rows_by_label.items():
        # assumes every prediction is a 1-D vector of identical length
        label_preds = np.stack(rows)

        plt.gca().set_prop_cycle(custom_cycler)
        stats_mean = np.mean(label_preds, axis=0)
        stats_std = np.std(label_preds, axis=0)

        for i, (mu, sigma) in enumerate(zip(stats_mean, stats_std)):
            label_str = le.inverse_transform([i])[0]
            variance = sigma * sigma
            print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")

            if sigma > 0:
                cdf = scipy.stats.norm.cdf(x, mu, sigma)
            else:
                # A zero scale is invalid for scipy.stats.norm and yields NaN;
                # draw the CDF of the degenerate distribution (step at mu).
                cdf = (x >= mu).astype(float)
            plt.plot(x, cdf, label=label_str, **kwargs)

        plt.legend()
        plt.savefig(os.path.join(outdir, f"{label}_prediction_cdf.pdf"))
        plt.clf()

    print("Decisions")

94

96

95

97

96

def pred_distribution_wt_sel(predictions: str, n: int, labels: str, labelencoder: str, outdir: str):
    '''
    Distribution of the predictions with selection process.

    1) For each dimension, select the n individuals with the maximum values
       for the focused dimension. We name S_i the set of n selected
       individuals for the dimension i.
    2) For each subset S_i, we plot the distribution of each dimension.

    The selection details are written to "stats.txt" in outdir and one figure
    per focused dimension is saved as "<label>_prediction_cdf.pdf".

    Args:
        predictions: path of the prediction file, read with
            core.data.read_features_with_matrix; the matrix is indexed as
            (individual, dimension).
        n: number of individuals kept per dimension. None keeps every
            individual (the slice is then unbounded).
        labels: path of the label file; currently unused by this function.
        labelencoder: path of a pickled encoder exposing inverse_transform
            (presumably a scikit-learn LabelEncoder -- TODO confirm).
        outdir: directory in which stats.txt and the figures are written.

    Raises:
        ValueError: if n is given and lower than 1.
    '''
    # A null or negative n would silently select nothing or drop the *last*
    # rows instead of keeping the best ones.
    if n is not None and n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")

    # Use the function parameters rather than the module-level `args`, so the
    # function also works when it is called outside of the command line.
    # NOTE(review): pickle.load can execute arbitrary code; only load encoder
    # files coming from a trusted source.
    with open(labelencoder, "rb") as f:
        le = pickle.load(f)

    _, matrix_preds = core.data.read_features_with_matrix(predictions)

    kwargs = dict(alpha=0.5)
    # The abscissa is the same for every curve.
    x = np.linspace(0, 1, 1000)
    nb_dims = matrix_preds.shape[1]

    # The context manager closes stats.txt even if plotting fails.
    with open(os.path.join(outdir, "stats.txt"), "w") as stats_of:
        for j in range(nb_dims):
            label_focused = le.inverse_transform([j])[0]
            # Sort on the negated column to get the n largest values first.
            indices = (-matrix_preds[:, j]).argsort()[:n]

            print(f"LABEL: {label_focused}", file=stats_of)
            print(f"INDICE: {j}", file=stats_of)
            print("indices", file=stats_of)
            print(indices, file=stats_of)
            print("Best values", file=stats_of)
            print(matrix_preds[indices, j], file=stats_of)
            print("All dimensions of best values", file=stats_of)
            print(matrix_preds[indices], file=stats_of)

            # Use the selected rows to build a plot.
            selected = matrix_preds[indices]
            stats_mean = np.mean(selected, axis=0)
            stats_std = np.std(selected, axis=0)
            for i in range(nb_dims):
                label_str = le.inverse_transform([i])[0]
                mu = stats_mean[i]
                sigma = stats_std[i]
                variance = sigma * sigma

                print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")

                if sigma > 0:
                    cdf = scipy.stats.norm.cdf(x, mu, sigma)
                else:
                    # A zero scale is invalid for scipy.stats.norm and yields
                    # NaN; draw the degenerate CDF (step at mu) instead.
                    cdf = (x >= mu).astype(float)
                plt.plot(x, cdf, label=label_str, **kwargs)

            plt.legend()
            plt.savefig(os.path.join(outdir, f"{label_focused}_prediction_cdf.pdf"))
            plt.clf()

112

161

113

162

114

def utt2dur(utt2dur: str, labels: str):
    '''
    Print the number of durations of an utt2dur file, their mean and their
    standard deviation.

    Each non-empty line is expected to hold an utterance id followed by its
    duration, separated by whitespace; the second field is parsed as a float.

    Args:
        utt2dur: path of the utt2dur file.
        labels: path of a label file; currently unused (kept so the signature
            matches the command line arguments).

    Raises:
        IndexError: if a non-empty line has a single field.
        ValueError: if the second field of a line is not a number.
    '''
    durations = []
    with open(utt2dur, "r") as f:
        for line in f:
            # split() without argument copes with tabs and repeated spaces
            # and strips the end of line.
            fields = line.split()
            if not fields:
                # Skip blank lines (e.g. a trailing empty line).
                continue
            durations.append(float(fields[1]))

    durations = np.asarray(durations, dtype=float)
    print(durations.shape)
    mean = np.mean(durations)
    std = np.std(durations)

    print(f"mean: {mean}")
    print(f"std: {std}")

133

182

134

183

135

if __name__ == "__main__":

184

if __name__ == "__main__":

136

185

137

# Parser

186

# Parser

138

parser = argparse.ArgumentParser(description="Statistics")

187

parser = argparse.ArgumentParser(description="Statistics")

139

subparsers = parser.add_subparsers(title="actions")

188

subparsers = parser.add_subparsers(title="actions")

140

189

141

# pred-distribution

190

# pred-distribution

142

parser_pred_dist = subparsers.add_parser("pred-distribution", help="plot distributions of prediction through labels")

191

parser_pred_dist = subparsers.add_parser("pred-distribution", help="plot distributions of prediction through labels")

143

parser_pred_dist.add_argument("--predictions", type=str, help="prediction file", required=True)

192

parser_pred_dist.add_argument("--predictions", type=str, help="prediction file", required=True)

144

parser_pred_dist.add_argument("--labels", type=str, help="label file", required=True)

193

parser_pred_dist.add_argument("--labels", type=str, help="label file", required=True)

145

parser_pred_dist.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)

194

parser_pred_dist.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)

146

parser_pred_dist.add_argument("--outdir", type=str, help="output file", required=True)

195

parser_pred_dist.add_argument("--outdir", type=str, help="output file", required=True)

147

parser_pred_dist.set_defaults(which="pred_distribution")

196

parser_pred_dist.set_defaults(which="pred_distribution")

148

197

149

# pred-distribution-with-selection

198

# pred-distribution-with-selection

150

parser_pred_dist_wt_sel = subparsers.add_parser("pred-distribution-with-selection", help="plot distributions of prediction through labels with a selection of the n best records by column/class prediction.")

199

parser_pred_dist_wt_sel = subparsers.add_parser("pred-distribution-with-selection", help="plot distributions of prediction through labels with a selection of the n best records by column/class prediction.")

151

parser_pred_dist_wt_sel.add_argument("--predictions", type=str, help="prediction file", required=True)

200

parser_pred_dist_wt_sel.add_argument("--predictions", type=str, help="prediction file", required=True)

201

parser_pred_dist_wt_sel.add_argument("-n", type=int, help="Number of maximum selected for each prediction y_i.")

152

parser_pred_dist_wt_sel.add_argument("--labels", type=str, help="label file", required=True)

202

parser_pred_dist_wt_sel.add_argument("--labels", type=str, help="label file", required=True)

153

parser_pred_dist_wt_sel.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)

203

parser_pred_dist_wt_sel.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)

154

parser_pred_dist_wt_sel.add_argument("--outdir", type=str, help="output file", required=True)

204

parser_pred_dist_wt_sel.add_argument("--outdir", type=str, help="output file", required=True)

155

parser_pred_dist_wt_sel.set_defaults(which="pred_distribution_with_selection")

205

parser_pred_dist_wt_sel.set_defaults(which="pred_distribution_with_selection")

206

156

# duration-stats

207

# duration-stats

157

parser_utt2dur = subparsers.add_parser("utt2dur", help="distribution of utt2dur")

208

parser_utt2dur = subparsers.add_parser("utt2dur", help="distribution of utt2dur")

158

parser_utt2dur.add_argument("--utt2dur", type=str, help="utt2dur file", required=True)

209

parser_utt2dur.add_argument("--utt2dur", type=str, help="utt2dur file", required=True)

159

parser_utt2dur.add_argument("--labels", type=str, default=None, help="labels file")

210

parser_utt2dur.add_argument("--labels", type=str, default=None, help="labels file")

160

parser_utt2dur.set_defaults(which="utt2dur")

211

parser_utt2dur.set_defaults(which="utt2dur")

161

212

162

# Parse

213

# Parse

163

args = parser.parse_args()

214

args = parser.parse_args()

164

215

165

# Run commands

216

# Run commands

GITLAB

Quillot Mathias / volia

Adding n argument to pred_distribution_wt_sel