stats.py 7.85 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223


import argparse

import os
import core.data
import math
import numpy as np
import scipy.stats
import pickle
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from utils import SubCommandRunner
from cycler import cycler


def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir: str):
    '''
    Plot, for each label, the distribution of the predicted classes.

    Predictions are grouped by the first label of each id. For every group,
    the mean and standard deviation of each prediction dimension are computed
    and the CDF of the corresponding normal distribution is plotted on [0, 1].
    One figure per label is written to ``<outdir>/<label>_prediction_cdf.pdf``.

    For example, for each character, we plot the distribution of the characters predicted.
    Another example, for each speaker, we plot the distribution of the characters predicted.

    Args:
        predictions: path of the prediction file, read with
            ``core.data.read_id_values``.
        labels: path of the label file, read with ``core.data.read_labels``.
        labelencoder: path of a pickled label encoder; its
            ``inverse_transform`` maps a dimension index to a class name
            (presumably a scikit-learn ``LabelEncoder`` -- confirm).
        outdir: directory in which the PDF figures are saved.
    '''
    # Use the function arguments, not the script-level `args` namespace, so
    # the function also works when imported and called directly.
    preds_by_id = core.data.read_id_values(predictions, float)
    labels_by_id = core.data.read_labels(labels)

    # NOTE(security): pickle.load can execute arbitrary code; only load
    # label encoder files coming from a trusted source.
    with open(labelencoder, "rb") as f:
        le = pickle.load(f)

    print("PREDICTIONS ---------------------------")
    # Collect the rows per label first and stack them once afterwards:
    # growing an array with np.append in the loop copies it at every step.
    rows_by_label = {}
    for id_, prediction in preds_by_id.items():
        label = labels_by_id[id_][0]
        rows_by_label.setdefault(label, []).append(prediction)

    custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
        cycler(linestyle=['-', '--', '-.']))
    kwargs = dict(alpha=0.5)
    x = np.linspace(0, 1, 1000)

    print("CALCULATING ---------------------------")

    for label, rows in rows_by_label.items():
        label_preds = np.stack(rows, axis=0)

        # plt.clf() below discards the axes, so the cycler has to be set
        # again on the fresh axes of every figure.
        plt.gca().set_prop_cycle(custom_cycler)
        stats_mean = np.mean(label_preds, axis=0)
        stats_std = np.std(label_preds, axis=0)

        for i in range(label_preds.shape[1]):
            label_str = le.inverse_transform([i])[0]
            mu = stats_mean[i]
            sigma = stats_std[i]
            variance = sigma * sigma
            print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")

            P = scipy.stats.norm.cdf(x, mu, sigma)
            plt.plot(x, P, label=label_str, **kwargs)

        plt.legend()
        plt.savefig(os.path.join(outdir, f"{label}_prediction_cdf.pdf"))
        plt.clf()

    print("Decisions")


def pred_distribution_wt_sel(predictions: str, n: int, labels: str, labelencoder: str, outdir: str):
    '''
    Distribution of the predictions with selection process.

    1) For each dimension, select the n individuals with the maximum values for the focused dimension.
    We name S_i the set of n selected individuals for the dimension i.
    2) For each subset S_i, we plot the distribution of each dimension
    (CDF of the normal distribution built from the mean and standard
    deviation of that dimension over S_i, evaluated on [0, 1]).

    The selected indices and values are written to ``<outdir>/stats.txt`` and
    one figure per focused class is saved as
    ``<outdir>/<class>_prediction_cdf.pdf``.

    Args:
        predictions: path of the prediction file, read with
            ``core.data.read_features_with_matrix``.
        n: number of best records kept for each dimension.
        labels: label file path; not used by this function, kept for
            interface compatibility with the command line.
        labelencoder: path of a pickled label encoder; its
            ``inverse_transform`` maps a dimension index to a class name
            (presumably a scikit-learn ``LabelEncoder`` -- confirm).
        outdir: directory receiving ``stats.txt`` and the PDF figures.
    '''
    # Use the function arguments, not the script-level `args` namespace, so
    # the function also works when imported and called directly.
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # label encoder files coming from a trusted source.
    with open(labelencoder, "rb") as f:
        le = pickle.load(f)

    _, matrix_preds = core.data.read_features_with_matrix(predictions)

    custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
        cycler(linestyle=['-', '--', '-.']))
    kwargs = dict(alpha=0.5)
    x = np.linspace(0, 1, 1000)

    nb_dims = matrix_preds.shape[1]
    # The class name of a dimension does not depend on the focused
    # dimension: resolve each of them once.
    class_names = [le.inverse_transform([i])[0] for i in range(nb_dims)]

    # The context manager closes stats.txt even if plotting fails midway.
    with open(os.path.join(outdir, "stats.txt"), "w") as stats_of:
        for j, label_focused in enumerate(class_names):
            # Negating the column makes argsort return the largest values first.
            indices = (-matrix_preds[:, j]).argsort()[:n]
            pred_ = matrix_preds[indices]

            print(f"LABEL: {label_focused}", file=stats_of)
            print(f"INDICE: {j}", file=stats_of)
            print("indices", file=stats_of)
            print(indices, file=stats_of)
            print("Best values", file=stats_of)
            print(matrix_preds[indices, j], file=stats_of)
            print("All dimensions of best values", file=stats_of)
            print(pred_, file=stats_of)

            # Same line styling as pred_distribution; plt.clf() below
            # discards the axes, so it is set again for every figure.
            plt.gca().set_prop_cycle(custom_cycler)

            # Use the selected subset to build the plot.
            stats_mean = np.mean(pred_, axis=0)
            stats_std = np.std(pred_, axis=0)
            for i, label_str in enumerate(class_names):
                mu = stats_mean[i]
                sigma = stats_std[i]
                variance = sigma * sigma

                print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")

                P = scipy.stats.norm.cdf(x, mu, sigma)
                plt.plot(x, P, label=label_str, **kwargs)

            plt.legend()
            plt.savefig(os.path.join(outdir, f"{label_focused}_prediction_cdf.pdf"))
            plt.clf()


def utt2dur(utt2dur: str, labels: str):
    '''
    Print statistics about the durations listed in a utt2dur file.

    Each non-empty line is split on whitespace and its second field is
    parsed as a float duration. The number of durations (as an array shape),
    their mean and their standard deviation are printed to stdout.

    Args:
        utt2dur: path of the utt2dur file.
        labels: labels file path or None; currently unused, kept for
            interface compatibility with the command line.

    Raises:
        IndexError: if a non-empty line has fewer than two fields.
        ValueError: if the second field of a line is not a number.
    '''
    durations = []
    with open(utt2dur, "r") as f:
        for line in f:
            # split() without argument accepts tabs and repeated spaces and
            # drops the trailing newline.
            fields = line.split()
            if not fields:
                # Blank line (e.g. at the end of the file): nothing to read.
                continue
            durations.append(float(fields[1]))

    durations = np.asarray(durations, dtype=float)
    print(durations.shape)

    print(f"mean: {np.mean(durations)}")
    print(f"std: {np.std(durations)}")


if __name__ == "__main__":
    # Command-line entry point: one sub-command per statistic. Each
    # sub-parser stores the key of its handler in `which`; the remaining
    # parsed options are handed to the runner together with that key.

    # Parser
    parser = argparse.ArgumentParser(description="Statistics")
    subparsers = parser.add_subparsers(title="actions")

    # pred-distribution
    parser_pred_dist = subparsers.add_parser("pred-distribution", help="plot distributions of prediction through labels")
    parser_pred_dist.add_argument("--predictions", type=str, help="prediction file", required=True)
    parser_pred_dist.add_argument("--labels", type=str, help="label file", required=True)
    parser_pred_dist.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
    parser_pred_dist.add_argument("--outdir", type=str, help="output directory", required=True)
    parser_pred_dist.set_defaults(which="pred_distribution")

    # pred-distribution-with-selection
    parser_pred_dist_wt_sel = subparsers.add_parser("pred-distribution-with-selection", help="plot distributions of prediction through labels with a selection of the n best records by column/class prediction.")
    parser_pred_dist_wt_sel.add_argument("--predictions", type=str, help="prediction file", required=True)
    parser_pred_dist_wt_sel.add_argument("-n", type=int, help="Number of maximum selected for each prediction y_i.", required=True)
    parser_pred_dist_wt_sel.add_argument("--labels", type=str, help="label file", required=True)
    parser_pred_dist_wt_sel.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
    parser_pred_dist_wt_sel.add_argument("--outdir", type=str, help="output directory", required=True)
    parser_pred_dist_wt_sel.set_defaults(which="pred_distribution_with_selection")

    # duration-stats
    parser_utt2dur = subparsers.add_parser("utt2dur", help="distribution of utt2dur")
    parser_utt2dur.add_argument("--utt2dur", type=str, help="utt2dur file", required=True)
    parser_utt2dur.add_argument("--labels", type=str, default=None, help="labels file")
    parser_utt2dur.set_defaults(which="utt2dur")

    # Parse
    args = parser.parse_args()

    # `which` is only set by the sub-parsers: when no action is given, report
    # a usage error instead of failing with an AttributeError below.
    if not hasattr(args, "which"):
        parser.error("an action is required (see --help)")

    # Run commands
    runner = SubCommandRunner({
        "pred_distribution": pred_distribution,
        "pred_distribution_with_selection": pred_distribution_wt_sel,
        "utt2dur": utt2dur
    })

    runner.run(args.which, args.__dict__, remove="which")