Quillot Mathias / volia

1

2

import argparse

2

import argparse

3

4

import os

4

import os

5

import core.data

5

import core.data

6

import math

6

import math

7

import numpy as np

7

import numpy as np

8

import scipy.stats

8

import scipy.stats

9

import pickle

9

import pickle

10

import matplotlib.pyplot as plt

10

import matplotlib.pyplot as plt

11

import matplotlib.colors as mcolors

11

import matplotlib.colors as mcolors

12

from utils import SubCommandRunner

12

from utils import SubCommandRunner

13

from cycler import cycler

13

from cycler import cycler

14

15

16

def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir: str):
    '''
    Distribution of the prediction.

    For each label, we plot the distribution of the class predicted.
    For example, for each character, we plot the distribution of the characters predicted.
    Another example, for each speaker, we plot the distribution of the characters predicted.

    For every label a normal distribution is fitted (mean / standard
    deviation) to each prediction dimension, and the corresponding CDF is
    drawn on [0, 1]. One figure per label is written to
    ``<outdir>/<label>_prediction_cdf.pdf``.

    Args:
        predictions: path of the prediction file, read with
            ``core.data.read_id_values(..., float)``.
        labels: path of the label file, read with ``core.data.read_labels``.
        labelencoder: path of a pickled label encoder; only its
            ``inverse_transform`` method is used.
        outdir: directory receiving the generated PDF files.
    '''
    # Use the function arguments rather than the script-level ``args``
    # namespace, so the function also works when imported.
    predictions_by_id = core.data.read_id_values(predictions, float)
    labels_by_id = core.data.read_labels(labels)

    # NOTE(review): pickle.load can execute arbitrary code; only load label
    # encoders coming from a trusted source.
    with open(labelencoder, "rb") as f:
        le = pickle.load(f)

    print("PREDICTIONS ---------------------------")
    # Collect the prediction vectors of each label in a list and stack them
    # once later, instead of re-allocating an array for every utterance.
    rows_by_label = {}
    for id_, predictions_ in predictions_by_id.items():
        label = labels_by_id[id_][0]  # first label of the utterance
        rows_by_label.setdefault(label, []).append(predictions_)

    custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
                     cycler(linestyle=['-', '--', '-.']))
    kwargs = dict(alpha=0.5)
    # The evaluation grid is the same for every curve.
    x = np.linspace(0, 1, 1000)

    print("CALCULATING ---------------------------")

    for label, rows in rows_by_label.items():
        label_predictions = np.stack(rows, axis=0)

        plt.gca().set_prop_cycle(custom_cycler)
        stats_mean = np.mean(label_predictions, axis=0)
        stats_std = np.std(label_predictions, axis=0)

        for i in range(label_predictions.shape[1]):
            label_str = le.inverse_transform([i])[0]
            mu = stats_mean[i]
            sigma = stats_std[i]
            variance = sigma * sigma
            print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")

            # CDF of the normal distribution fitted on dimension i.
            P = scipy.stats.norm.cdf(x, mu, sigma)
            plt.plot(x, P, label=label_str, **kwargs)

        plt.legend()
        plt.savefig(os.path.join(outdir, f"{label}_prediction_cdf.pdf"))
        # Start from an empty figure for the next label.
        plt.clf()

    print("Decisions")

96

97

98

def pred_distribution_wt_sel(predictions: str, n: int, labels: str, labelencoder: str, outdir: str):
    '''
    Distribution of the predictions with selection process.

    1) For each dimension, select the n individuals with the maximum values for the focused dimension.
       We name S_i the set of n selected individuals for the dimension i.
    2) For each subset S_i, we plot the distribution of each dimension.

    For every focused dimension, the selected indices and values are written
    to ``<outdir>/stats.txt`` and a figure with the fitted normal CDF of each
    dimension is saved as ``<outdir>/<label>_prediction_cdf.pdf``.

    Args:
        predictions: path of the prediction file, read with
            ``core.data.read_features_with_matrix``.
        n: number of top-scoring rows kept for each dimension.
        labels: label file path; accepted for command-line compatibility but
            not used by this function.
        labelencoder: path of a pickled label encoder; only its
            ``inverse_transform`` method is used.
        outdir: directory receiving ``stats.txt`` and the PDF files.
    '''
    # Use the function arguments rather than the script-level ``args``
    # namespace, so the function also works when imported.
    # NOTE(review): pickle.load can execute arbitrary code; only load label
    # encoders coming from a trusted source.
    with open(labelencoder, "rb") as f:
        le = pickle.load(f)

    keys_preds, matrix_preds = core.data.read_features_with_matrix(predictions)

    kwargs = dict(alpha=0.5)
    # The evaluation grid is the same for every curve.
    x = np.linspace(0, 1, 1000)
    nb_dims = matrix_preds.shape[1]

    # The context manager closes the report even if plotting fails midway.
    with open(os.path.join(outdir, "stats.txt"), "w") as stats_of:
        for j in range(nb_dims):
            label_focused = le.inverse_transform([j])[0]
            # Sorting the negated column gives descending order: keep the n
            # rows with the highest score on dimension j.
            indices = (-matrix_preds[:, j]).argsort()[:n]

            print(f"LABEL: {label_focused}", file=stats_of)
            print(f"INDICE: {j}", file=stats_of)
            print("indices", file=stats_of)
            print(indices, file=stats_of)
            print("Best values", file=stats_of)
            print(matrix_preds[indices, j], file=stats_of)
            print("All dimensions of best values", file=stats_of)
            print(matrix_preds[indices], file=stats_of)

            # Use it to build a plot.
            pred_ = matrix_preds[indices]
            stats_mean = np.mean(pred_, axis=0)
            stats_std = np.std(pred_, axis=0)
            for i in range(nb_dims):
                label_str = le.inverse_transform([i])[0]
                mu = stats_mean[i]
                sigma = stats_std[i]
                variance = sigma * sigma

                print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")

                # CDF of the normal distribution fitted on dimension i.
                P = scipy.stats.norm.cdf(x, mu, sigma)
                plt.plot(x, P, label=label_str, **kwargs)

            plt.legend()
            plt.savefig(os.path.join(outdir, f"{label_focused}_prediction_cdf.pdf"))
            # Start from an empty figure for the next dimension.
            plt.clf()

161

162

163

def utt2dur(utt2dur: str, labels: str):
    '''
    Print basic statistics of the durations listed in an utt2dur file.

    Each non-empty line must hold at least two whitespace-separated fields;
    the second one is parsed as the duration. The number of durations (as an
    array shape), their mean and their standard deviation are printed.

    Args:
        utt2dur: path of the utt2dur file.
        labels: labels file path (may be None); accepted for command-line
            compatibility but not used yet.
    '''
    durations = []
    with open(utt2dur, "r") as f:
        for line in f:
            # split() without argument tolerates tabs, repeated spaces and
            # the trailing newline.
            fields = line.split()
            if not fields:
                # Blank line (e.g. at the end of the file): nothing to read.
                continue
            durations.append(float(fields[1]))

    durations = np.asarray(durations, dtype=float)
    print(durations.shape)
    mean = np.mean(durations)
    std = np.std(durations)

    print(f"mean: {mean}")
    print(f"std: {std}")

182

183

184

if __name__ == "__main__":

    # Parser
    parser = argparse.ArgumentParser(description="Statistics")
    subparsers = parser.add_subparsers(title="actions")

    # pred-distribution
    parser_pred_dist = subparsers.add_parser("pred-distribution", help="plot distributions of prediction through labels")
    parser_pred_dist.add_argument("--predictions", type=str, help="prediction file", required=True)
    parser_pred_dist.add_argument("--labels", type=str, help="label file", required=True)
    parser_pred_dist.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
    parser_pred_dist.add_argument("--outdir", type=str, help="output file", required=True)
    parser_pred_dist.set_defaults(which="pred_distribution")

    # pred-distribution-with-selection
    parser_pred_dist_wt_sel = subparsers.add_parser("pred-distribution-with-selection", help="plot distributions of prediction through labels with a selection of the n best records by column/class prediction.")
    parser_pred_dist_wt_sel.add_argument("--predictions", type=str, help="prediction file", required=True)
    parser_pred_dist_wt_sel.add_argument("-n", type=int, help="Number of maximum selected for each prediction y_i.", required=True)
    parser_pred_dist_wt_sel.add_argument("--labels", type=str, help="label file", required=True)
    parser_pred_dist_wt_sel.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
    parser_pred_dist_wt_sel.add_argument("--outdir", type=str, help="output file", required=True)
    parser_pred_dist_wt_sel.set_defaults(which="pred_distribution_with_selection")

    # duration-stats
    parser_utt2dur = subparsers.add_parser("utt2dur", help="distribution of utt2dur")
    parser_utt2dur.add_argument("--utt2dur", type=str, help="utt2dur file", required=True)
    parser_utt2dur.add_argument("--labels", type=str, default=None, help="labels file")
    parser_utt2dur.set_defaults(which="utt2dur")

    # Parse
    args = parser.parse_args()

    # "which" is only set by the sub-parsers; without a sub-command it is
    # missing, so fail with a usage message instead of an AttributeError.
    if not hasattr(args, "which"):
        parser.error("an action is required")

    # Run commands
    # Maps the "which" default of each sub-parser to the function to call.
    runner = SubCommandRunner({
        "pred_distribution": pred_distribution,
        "pred_distribution_with_selection": pred_distribution_wt_sel,
        "utt2dur": utt2dur
    })

    runner.run(args.which, args.__dict__, remove="which")

GITLAB

Quillot Mathias / volia

Specifying that "n" argument is required to pred_distribution_wt_sel