stats.py 5.96 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172


import argparse

import os
import core.data
import math
import numpy as np
import scipy.stats
import pickle
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from utils import SubCommandRunner


from cycler import cycler

def pred_distribution(predictions: str, labels: str, labelencoder: str, outdir: str):
    """Plot, per reference label, the normal CDF fitted to each prediction column.

    Predictions are grouped by the first label of their utterance. For every
    group the column-wise mean and standard deviation are computed, and the CDF
    of the matching normal distribution is drawn over [0, 1], one curve per
    prediction column. One figure per label is written to
    ``<outdir>/<label>_prediction_cdf.pdf``.

    Args:
        predictions: path passed to ``core.data.read_id_values`` (values read
            as ``float``).
        labels: path passed to ``core.data.read_labels``.
        labelencoder: path to a pickled object exposing ``inverse_transform``
            (presumably a scikit-learn ``LabelEncoder`` -- confirm with the
            producer of the file).
        outdir: directory in which the PDF figures are saved.
    """
    # Use the function arguments: the previous version read the module-level
    # ``args`` global, which only exists when this file is run as a script.
    preds_by_id = core.data.read_id_values(predictions, float)
    labels_by_id = core.data.read_labels(labels)

    # NOTE(security): unpickling executes arbitrary code embedded in the file;
    # only load label encoders that come from a trusted source.
    with open(labelencoder, "rb") as f:
        le = pickle.load(f)

    print("PREDICTIONS ---------------------------")
    # Collect the rows in plain lists and stack once per label below; calling
    # np.append for every utterance re-copies the whole matrix each time.
    rows_by_label = {}
    for id_, scores in preds_by_id.items():
        rows_by_label.setdefault(labels_by_id[id_][0], []).append(scores)

    print("CALCULATING ---------------------------")

    custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
        cycler(linestyle=['-', '--', '-.']))

    # The evaluation grid is the same for every curve, so build it once.
    x = np.linspace(0, 1, 1000)

    for label, rows in rows_by_label.items():
        label_scores = np.stack(rows, axis=0)

        # clf() at the end of each iteration resets the axes, so the property
        # cycle has to be installed again for every figure.
        plt.gca().set_prop_cycle(custom_cycler)
        means = np.mean(label_scores, axis=0)
        stds = np.std(label_scores, axis=0)

        for i in range(label_scores.shape[1]):
            label_str = le.inverse_transform([i])[0]
            mu = means[i]
            sigma = stds[i]
            variance = sigma * sigma
            print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")

            # NOTE(review): a label seen only once (or with constant scores)
            # yields sigma == 0, for which scipy is expected to return NaN and
            # the curve is then not drawn -- confirm whether that is acceptable.
            cdf = scipy.stats.norm.cdf(x, mu, sigma)
            plt.plot(x, cdf, label=label_str, alpha=0.5)

        plt.legend()
        plt.savefig(os.path.join(outdir, f"{label}_prediction_cdf.pdf"))
        plt.clf()

    print("Decisions")


def pred_distribution_wt_sel(predictions: str, labels: str, labelencoder: str, outdir: str):
    """Print, for each prediction column, the rows holding its highest values.

    The prediction matrix is loaded with
    ``core.data.read_features_with_matrix``. For every column the indices of
    the ``top_n`` (3) largest entries are printed, followed by those entries
    and by the complete rows they belong to.

    Only ``predictions`` is used at the moment; ``labels``, ``labelencoder``
    and ``outdir`` are accepted but not read by this function.
    """
    top_n = 3  # number of best records kept for each column

    _keys, scores = core.data.read_features_with_matrix(predictions)
    print(scores.shape)

    n_columns = scores.shape[1]
    for column in range(n_columns):
        # Sorting the negated column ascending ranks the largest values first.
        negated = -scores[:, column]
        best_rows = negated.argsort()[:top_n]

        print(f"INDICE: {column}")
        print("indices")
        print(best_rows)
        print("Best values")
        print(scores[best_rows, column])
        print("All dimensions of best values")
        print(scores[best_rows])


def utt2dur(utt2dur: str, labels: str):
    """Print the count, mean and standard deviation of utterance durations.

    Each non-blank line of the file is split on whitespace and its second
    field is read as the duration (a float).

    Args:
        utt2dur: path to the duration file, one ``<utt-id> <duration>`` entry
            per line.
        labels: accepted for command-line compatibility; not used yet.

    Raises:
        IndexError: if a non-blank line has fewer than two fields.
        ValueError: if a duration field is not a valid float.
    """
    durations = []
    with open(utt2dur, "r") as f:
        for line in f:
            # split() with no argument tolerates tabs, repeated spaces and the
            # trailing newline, where split(" ") produced empty fields.
            fields = line.split()
            if not fields:
                # Blank lines (e.g. at the end of the file) carry no entry.
                continue
            durations.append(float(fields[1]))

    durations = np.asarray(durations, dtype=float)
    print(durations.shape)
    mean = np.mean(durations)
    std = np.std(durations)

    print(f"mean: {mean}")
    print(f"std: {std}")


if __name__ == "__main__":
    # Command-line entry point: one sub-command per statistics function. Each
    # sub-parser stores the key of its handler in ``which``; the remaining
    # parsed options are handed to the runner together with that key.

    # Parser
    parser = argparse.ArgumentParser(description="Statistics")
    subparsers = parser.add_subparsers(title="actions")

    # pred-distribution
    parser_pred_dist = subparsers.add_parser("pred-distribution", help="plot distributions of prediction through labels")
    parser_pred_dist.add_argument("--predictions", type=str, help="prediction file", required=True)
    parser_pred_dist.add_argument("--labels", type=str, help="label file", required=True)
    parser_pred_dist.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
    parser_pred_dist.add_argument("--outdir", type=str, help="output file", required=True)
    parser_pred_dist.set_defaults(which="pred_distribution")

    # pred-distribution-with-selection
    parser_pred_dist_wt_sel = subparsers.add_parser("pred-distribution-with-selection", help="plot distributions of prediction through labels with a selection of the n best records by column/class prediction.")
    parser_pred_dist_wt_sel.add_argument("--predictions", type=str, help="prediction file", required=True)
    parser_pred_dist_wt_sel.add_argument("--labels", type=str, help="label file", required=True)
    parser_pred_dist_wt_sel.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
    parser_pred_dist_wt_sel.add_argument("--outdir", type=str, help="output file", required=True)
    parser_pred_dist_wt_sel.set_defaults(which="pred_distribution_with_selection")

    # duration-stats
    parser_utt2dur = subparsers.add_parser("utt2dur", help="distribution of utt2dur")
    parser_utt2dur.add_argument("--utt2dur", type=str, help="utt2dur file", required=True)
    parser_utt2dur.add_argument("--labels", type=str, default=None, help="labels file")
    parser_utt2dur.set_defaults(which="utt2dur")

    # Parse
    args = parser.parse_args()

    # Sub-commands are optional for argparse, so a bare invocation leaves
    # ``which`` unset; report a usage error instead of failing later with an
    # AttributeError on ``args.which``.
    if not hasattr(args, "which"):
        parser.error("missing action, choose one of: " + ", ".join(subparsers.choices))

    # Run commands
    runner = SubCommandRunner({
        "pred_distribution": pred_distribution,
        "pred_distribution_with_selection": pred_distribution_wt_sel,
        "utt2dur": utt2dur
    })

    runner.run(args.which, args.__dict__, remove="which")