Quillot Mathias / volia

Blame view

volia/stats.py 3.34 KB
  
  import argparse
  
  import os
  import core.data
  import math
  import numpy as np
  import scipy.stats
  import pickle
  import matplotlib.pyplot as plt
  import matplotlib.colors as mcolors
  
  
  
  from cycler import cycler
  
  def stats():
      """Print the ``Decisions`` banner line to standard output."""
      banner = "Decisions"
      print(banner)
  
  
  # Debug aid: show the keys of matplotlib's TABLEAU_COLORS table, which the
  # script below feeds into the colour cycle of its plots.
  # NOTE: this runs on import as well as when the file is executed as a script.
  print([*mcolors.TABLEAU_COLORS])
  
  
  def group_predictions_by_label(predictions, labels):
      """Group per-utterance prediction vectors by their reference label.

      Args:
          predictions: Mapping from utterance id to a 1-D sequence of scores
              (one score per class).
          labels: Mapping from utterance id to a sequence whose first item is
              the label of that utterance.

      Returns:
          A dict mapping each label, in first-seen order, to a 2-D array with
          one row per utterance carrying that label.

      Raises:
          KeyError: If an id present in ``predictions`` is missing from
              ``labels``.
      """
      rows_by_label = {}
      for id_, scores in predictions.items():
          label = labels[id_][0]
          rows_by_label.setdefault(label, []).append(np.asarray(scores))
      # Stack once per label: growing an array with np.append inside the loop
      # re-copies the whole array for every utterance.
      return {
          label: np.stack(rows, axis=0)
          for label, rows in rows_by_label.items()
      }


  def gaussian_cdf(x, mu, sigma):
      """Evaluate the CDF of a normal distribution N(mu, sigma**2) at ``x``.

      Args:
          x: Array-like of evaluation points.
          mu: Mean of the distribution.
          sigma: Standard deviation of the distribution (non-negative).

      Returns:
          Array of CDF values with the same shape as ``x``.
      """
      if sigma == 0:
          # scipy returns NaN for a non-positive scale, which would make the
          # curve silently vanish from the plot (e.g. a label seen only once).
          # The CDF of a point mass at mu is a unit step at mu.
          return (np.asarray(x) >= mu).astype(float)
      return scipy.stats.norm.cdf(x, mu, sigma)


  def plot_label_cdfs(label, label_predictions, le, prop_cycler, outdir):
      """Plot one Gaussian CDF per class for a single label and save it as PDF.

      For each column of ``label_predictions`` a normal distribution is fitted
      (mean and standard deviation of the column) and its CDF is drawn over
      [0, 1]. The figure is written to ``<outdir>/<label>_prediction_cdf.pdf``
      and the current figure is cleared afterwards.

      Args:
          label: Label whose utterances are being summarised; used in the
              output file name.
          label_predictions: 2-D array, one row per utterance and one column
              per class.
          le: Object exposing ``inverse_transform``, used to turn a column
              index into a legend name.
          prop_cycler: Property cycler applied to the axes before plotting.
          outdir: Directory receiving the PDF.
      """
      plt.gca().set_prop_cycle(prop_cycler)
      means = np.mean(label_predictions, axis=0)
      stds = np.std(label_predictions, axis=0)
      x = np.linspace(0, 1, 1000)

      for i, (mu, sigma) in enumerate(zip(means, stds)):
          label_str = le.inverse_transform([i])[0]
          variance = sigma * sigma
          print(f"{i}: mu {mu}, var {variance}, sigma {sigma}")
          plt.plot(x, gaussian_cdf(x, mu, sigma), label=label_str, alpha=0.5)

      plt.legend()
      plt.savefig(os.path.join(outdir, f"{label}_prediction_cdf.pdf"))
      plt.clf()


  def main():
      """Parse the command line and write one prediction-CDF plot per label."""
      # Parser
      parser = argparse.ArgumentParser(description="")

      # Arguments
      parser.add_argument("--predictions", type=str, help="prediction file", required=True)
      parser.add_argument("--labels", type=str, help="label file", required=True)
      parser.add_argument("--labelencoder", type=str, help="label encode pickle file", required=True)
      parser.add_argument("--outdir", type=str, help="output directory", required=True)

      args = parser.parse_args()

      predictions = core.data.read_id_values(args.predictions, float)
      labels = core.data.read_labels(args.labels)

      # SECURITY: pickle.load can execute arbitrary code embedded in the file.
      # Only pass label-encoder pickles that come from a trusted source.
      with open(args.labelencoder, "rb") as f:
          le = pickle.load(f)

      print("PREDICTIONS ---------------------------")
      # Kept in a local name so the module-level function `stats` is not
      # overwritten.
      grouped = group_predictions_by_label(predictions, labels)

      print("CALCULATING ---------------------------")
      custom_cycler = (cycler(color=list(mcolors.TABLEAU_COLORS)) *
          cycler(linestyle=['-', '--', '-.']))

      # savefig does not create missing directories.
      os.makedirs(args.outdir, exist_ok=True)

      for label, label_predictions in grouped.items():
          plot_label_cdfs(label, label_predictions, le, custom_cycler, args.outdir)

      # TODO:
      # One graph for each label, displaying the distribution of its
      # prediction outputs.


  if __name__ == "__main__":
      main()