stats.py 3.34 KB
edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114


import argparse

import os
import core.data
import math
import numpy as np
import scipy.stats
import pickle
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors


from cycler import cycler

def stats():
    """Print the "Decisions" heading to standard output."""
    heading = "Decisions"
    print(heading)


# Debug output: print the keys of matplotlib's TABLEAU_COLORS mapping, i.e. the
# color names that the plotting code further down cycles through.
# NOTE(review): this statement sits outside the __main__ guard, so it also runs
# whenever the module is imported -- confirm that this is intended.
print(list(mcolors.TABLEAU_COLORS))


def _group_predictions_by_label(predictions, labels):
    """Group per-utterance prediction vectors by their reference label.

    Args:
        predictions: mapping from utterance id to a 1-D sequence of scores;
            column ``i`` is later interpreted as encoded class ``i``.
        labels: mapping from utterance id to a sequence whose first element
            is the reference label of that utterance.

    Returns:
        dict mapping each label (in first-seen order) to a 2-D array with one
        row per utterance carrying that label.

    Raises:
        KeyError: if an id found in ``predictions`` is missing from ``labels``.
        ValueError: if the prediction vectors of one label differ in length.
    """
    rows_by_label = {}
    for utt_id, scores in predictions.items():
        rows_by_label.setdefault(labels[utt_id][0], []).append(np.asarray(scores))
    # Stack once per label: calling np.append inside the loop would copy the
    # whole accumulated array for every utterance (quadratic cost).
    return {label: np.stack(rows, axis=0) for label, rows in rows_by_label.items()}


def _gaussian_cdf(x, mu, sigma):
    """Evaluate the CDF of the normal distribution N(mu, sigma**2) at ``x``.

    scipy returns NaN everywhere when the scale is 0 (this happens for a label
    with a single utterance or with constant scores), which would produce an
    empty curve. In that case the CDF of a point mass at ``mu`` is returned
    instead: 0 below ``mu`` and 1 from ``mu`` onwards.
    """
    if sigma == 0:
        return (np.asarray(x) >= mu).astype(float)
    return scipy.stats.norm.cdf(x, mu, sigma)


def main():
    """Plot, for every reference label, the Gaussian CDF of each class score.

    Reads the prediction and label files given on the command line, groups the
    prediction vectors by reference label, fits a normal distribution (mean and
    standard deviation) to every score column and writes one figure per label
    to ``<outdir>/<label>_prediction_cdf.pdf``. Each curve in a figure is named
    after the class obtained from the pickled label encoder.
    """
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("--predictions", type=str, help="prediction file", required=True)
    parser.add_argument("--labels", type=str, help="label file", required=True)
    parser.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
    parser.add_argument("--outdir", type=str, help="output directory", required=True)
    args = parser.parse_args()

    predictions = core.data.read_id_values(args.predictions, float)
    labels = core.data.read_labels(args.labels)

    # NOTE(security): pickle.load can execute arbitrary code; only pass label
    # encoder files that come from a trusted source.
    with open(args.labelencoder, "rb") as f:
        le = pickle.load(f)

    print("PREDICTIONS ---------------------------")
    predictions_by_label = _group_predictions_by_label(predictions, labels)

    print("CALCULATING ---------------------------")
    # savefig does not create missing directories.
    os.makedirs(args.outdir, exist_ok=True)

    # Every color is combined with every line style so that many classes stay
    # distinguishable within one figure.
    custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
                     cycler(linestyle=['-', '--', '-.']))

    # All curves are evaluated on the same grid over [0, 1].
    x = np.linspace(0, 1, 1000)

    for label, label_predictions in predictions_by_label.items():
        # plt.clf() below discards the axes, so the cycle is set for each figure.
        plt.gca().set_prop_cycle(custom_cycler)
        means = np.mean(label_predictions, axis=0)
        stds = np.std(label_predictions, axis=0)

        for i, (mu, sigma) in enumerate(zip(means, stds)):
            label_str = le.inverse_transform([i])[0]
            variance = sigma * sigma
            print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")
            plt.plot(x, _gaussian_cdf(x, mu, sigma), label=label_str, alpha=0.5)

        plt.legend()
        plt.savefig(os.path.join(args.outdir, f"{label}_prediction_cdf.pdf"))
        plt.clf()


if __name__ == "__main__":
    main()