# Rouvier Mickael / Deft 2017 - Sentiment Analysis
#
# Blame view
#
# bin/results_cnn.py 12.6 KB
  from keras.models import Sequential
  from keras.layers.core import TimeDistributedDense
  from keras.layers.advanced_activations import PReLU
  from keras.layers.normalization import BatchNormalization
  from keras import backend as K
  #from keras.layers.core import Dense, Dropout, Activation, Reshape, Flatten
  from keras.layers import TimeDistributed, Lambda, Input, merge, Bidirectional
  from keras.layers.core import Dense, Activation, Dropout, Flatten
  from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, Conv1D, MaxPooling1D
  #from keras.layers.convolutional import Convolution1D, MaxPooling1D, AveragePooling1D
  from keras.optimizers import SGD, Adadelta, Adam, Adamax, RMSprop
  from keras.models import Model
  #from keras.layers.recurrent import LSTM, GRU, SimpleRNN
  from keras.constraints import maxnorm
  from keras.callbacks import Callback, EarlyStopping
  from keras.preprocessing import sequence
  from keras.layers.embeddings import Embedding
  from keras.regularizers import l2, activity_l2
  from keras import regularizers
  from keras.models import model_from_json
  import theano
  from theano import tensor
  import warnings
  import sys
  import time
  import os
  import numpy as np
  import fileinput
  import math
  
  # Install a global "ignore" filter so that no warning is printed for the
  # rest of the process.
  warnings.filterwarnings("ignore")
  
  def create_class_weight(labels_dict, mu=0.55):
      """Derive log-scaled class weights from per-class sample counts.

      Every class receives ``log(mu * total / count)`` where ``total`` is the
      sum of all counts; the result is floored at 1.0 so that no class is
      ever down-weighted.

      Args:
          labels_dict: mapping of class id -> number of samples of that class.
          mu: scaling factor applied to ``total / count`` before the log.

      Returns:
          dict mapping each class id of ``labels_dict`` to its weight
          (a float >= 1.0).
      """
      # Built-in sum() accepts the dict view returned by values() on Python 3
      # as well as the list returned on Python 2.
      total = sum(labels_dict.values())
      class_weight = dict()

      for key, count in labels_dict.items():
          if not count:
              # A class without any sample would divide by zero; its weight
              # is never applied to a sample, so use the neutral floor.
              class_weight[key] = 1.0
              continue
          score = math.log(mu * total / float(count))
          class_weight[key] = score if score > 1.0 else 1.0

      return class_weight
  
  
  def score_deft_2017(pred, gold, task):
      """Score predictions against gold rows with the DEFT 2017 metric.

      For every row index of ``gold`` the position of the largest value of
      the prediction row and of the gold row is stored in a ConfusionMatrix
      (the first position wins on ties).  Returns whatever the matrix's
      ``score_deft_2017(task)`` returns.
      """
      matrix = ConfusionMatrix()
      for row in range(len(gold)):
          predicted = pred[row].tolist()
          expected = gold[row].tolist()
          predicted_class = predicted.index(max(predicted))
          expected_class = expected.index(max(expected))
          matrix.store(predicted_class, expected_class)
      return matrix.score_deft_2017(task)
  
  
  def score_semeval_2017(pred, gold):
      """Score predictions against gold rows with the SemEval 2017 metric.

      For every row index of ``gold`` the position of the largest value of
      the prediction row and of the gold row is stored in a ConfusionMatrix
      (the first position wins on ties).  Returns the matrix's
      ``score_semeval_2017()`` value.
      """
      matrix = ConfusionMatrix()
      for row in range(len(gold)):
          predicted = pred[row].tolist()
          expected = gold[row].tolist()
          predicted_class = predicted.index(max(predicted))
          expected_class = expected.index(max(expected))
          matrix.store(predicted_class, expected_class)
      return matrix.score_semeval_2017()
  
  def score_semeval_2016(pred, gold):
      """Score predictions against gold rows with the SemEval 2016 metric.

      For every row index of ``gold`` the position of the largest value of
      the prediction row and of the gold row is stored in a ConfusionMatrix
      (the first position wins on ties).  Returns the matrix's
      ``score_semeval_2016()`` value.
      """
      matrix = ConfusionMatrix()
      for row in range(len(gold)):
          predicted = pred[row].tolist()
          expected = gold[row].tolist()
          predicted_class = predicted.index(max(predicted))
          expected_class = expected.index(max(expected))
          matrix.store(predicted_class, expected_class)
      return matrix.score_semeval_2016()
  
  
  
  
  class validation_semeval(Callback):
      """Keras callback that scores the model on a dev file after each epoch.

      At the end of every epoch the dev file is read with ``read_sentiment``,
      padded/truncated to ``maxlen``, run through the model and scored with
      ``score_deft_2017``.  Whenever the score beats the best one seen so
      far, the architecture is written to ``<output>.json`` and the weights
      to ``<output>.h5``.

      ``patience`` and ``wait`` are stored, but nothing in this class stops
      training based on them.
      """

      def __init__(self, dev_files, word_word2vec, maxlen, output, task, patience=5):
          # super() has to be anchored on this class: anchoring it on
          # Callback starts the lookup *after* Callback and therefore never
          # runs Callback.__init__.
          super(validation_semeval, self).__init__()
          self.best_result = -1.0
          self.best_round = 1
          self.wait = 0
          self.output = output
          self.patience = patience
          self.counter = 0
          self.maxlen = maxlen
          self.dev_files = dev_files
          self.word_word2vec = word_word2vec
          self.task = task

      def on_epoch_end(self, epoch, logs=None):
          """Evaluate on the dev file and checkpoint the model on improvement.

          ``epoch`` and ``logs`` are accepted for the Keras callback
          signature but are not read; rounds are counted with
          ``self.counter`` instead.
          """
          self.counter += 1

          _, X_word_dev, y_dev = read_sentiment(self.dev_files, self.word_word2vec, self.task)
          X_word_dev = sequence.pad_sequences(X_word_dev, maxlen=self.maxlen, padding='post', truncating='post')
          predict = self.model.predict(X_word_dev, batch_size=1024, verbose=1)
          current = score_deft_2017(predict, y_dev, self.task)

          if self.best_result < current:
              self.best_result = current
              self.best_round = self.counter
              self.wait = 0

              # Context manager so the JSON file is flushed and closed
              # before the weights are written next to it.
              with open(self.output + ".json", 'w') as json_file:
                  json_file.write(self.model.to_json())
              self.model.save_weights(self.output + ".h5", overwrite=True)

          # Single parenthesised argument: identical output with the
          # Python 2 print statement and the Python 3 print function.
          print("\n\n")
          print("Dev score 2017 : " + str(current))
          print("Best score 2017 : " + str(self.best_result))
          print("\n\n")
  
  
  
  class ConfusionMatrix(object):
      """Per-class tp/tn/fp/fn counters for single-label predictions.

      Classes are registered lazily the first time they appear as a
      prediction or as a gold label.  Metrics queried for a class that was
      never seen evaluate to 0 instead of raising ``KeyError``.
      """

      def __init__(self):
          # class id -> {"tp": int, "tn": int, "fp": int, "fn": int}
          self.h = {}
          # number of (prediction, truth) pairs stored so far
          self.total = 0

      def _register(self, name):
          """Return the counters of class *name*, creating them if needed."""
          if name not in self.h:
              # Every pair stored before this class first showed up involved
              # it neither as prediction nor as truth, i.e. each of them is
              # a true negative for it.
              self.h[name] = {"tp": 0, "tn": self.total, "fp": 0, "fn": 0}
          return self.h[name]

      def store(self, actual, truth):
          """Record one sample: *actual* is the predicted class, *truth* the gold one."""
          predicted = self._register(actual)
          expected = self._register(truth)

          if actual == truth:
              predicted["tp"] += 1
          else:
              predicted["fp"] += 1
              expected["fn"] += 1

          # The sample is a true negative for every class it does not involve.
          for label, counts in self.h.items():
              if label != actual and label != truth:
                  counts["tn"] += 1

          self.total += 1

      def _rate(self, name, miss_key):
          """Return tp / (tp + counts[miss_key]) for class *name*, or 0 when undefined."""
          counts = self.h.get(name)
          if counts is None:
              return 0
          denominator = counts["tp"] + counts[miss_key]
          if denominator == 0:
              return 0
          return float(counts["tp"]) / float(denominator)

      def recall(self, name):
          """Return tp / (tp + fn) for class *name* (0 if the class has no gold sample)."""
          return self._rate(name, "fn")

      def precision(self, name):
          """Return tp / (tp + fp) for class *name* (0 if the class was never predicted)."""
          return self._rate(name, "fp")

      def fscore(self, name):
          """Return the harmonic mean of precision and recall for class *name*."""
          precision = self.precision(name)
          recall = self.recall(name)
          if precision + recall == 0:
              return 0
          return (2 * precision * recall) / (precision + recall)

      def score_semeval_2016(self):
          """Return the mean F-score of classes 0 and 1."""
          return (self.fscore(0) + self.fscore(1)) / 2

      def score_semeval_2017(self):
          """Return the mean recall of classes 0, 1 and 2."""
          return (self.recall(0) + self.recall(1) + self.recall(2)) / 3

      def score_deft_2017(self, task):
          """Print the per-class F-scores of *task* and return their mean.

          ``"task1"`` and ``"task3"`` use four classes, ``"task2"`` uses two.
          Any other task only prints the raw counters and returns ``None``.
          """
          print(self.h)

          if task in ("task1", "task3"):
              names = ("negative", "positive", "objective", "mixed")
          elif task == "task2":
              names = ("figurative", "nonfigurative")
          else:
              return None

          scores = [self.fscore(index) for index in range(len(names))]
          for label_name, value in zip(names, scores):
              print(label_name + " : " + str(value))
          return sum(scores) / len(names)

      def info(self):
          """Print the raw per-class counters."""
          print(self.h)
  
  
  def read_word2vec(embedding_file):
      """Load word embeddings from a space-separated text file.

      A line with exactly two fields is treated as the header and its second
      field is taken as the embedding size.  A line with more than three
      fields is treated as ``word v1 v2 ...``.  Any other line (blank lines
      included) is ignored.

      Args:
          embedding_file: path of the embedding text file.

      Returns:
          Tuple ``(counter, size, dico, ar)``: number of vectors read, size
          announced by the header (0 when no header was found), mapping
          word -> row index, and a float array holding the vectors in file
          order.
      """
      vectors = []
      dico = {}
      size = 0

      with open(embedding_file) as f:
          for line in f:
              fields = line.strip().split(" ")
              if len(fields) > 3:
                  # A real list is needed here: map() is lazy on Python 3
                  # and would put iterator objects into the array.
                  row = [float(value) for value in fields[1:]]
                  dico[fields[0]] = len(vectors)
                  vectors.append(row)
              elif len(fields) == 2:
                  size = int(fields[1])
              # One-field lines (e.g. an empty line) have no fields[1];
              # skip them instead of raising IndexError.

      ar = np.array(vectors, dtype='float')

      return len(vectors), size, dico, ar
  
  def read_sentiment_train(sentiment_file, word_word2vec, task):
      """Read a labelled training file and compute per-class weights.

      Each line is tab-separated: column 0 is the sample id, column 1 the
      label and column 2 the text, tokenised on single spaces.  Tokens that
      are not keys of ``word_word2vec`` are dropped.  A label that does not
      belong to the task yields an all-zero target row and is not counted.

      The per-class counts, the count dictionary and the resulting class
      weights are printed.

      Args:
          sentiment_file: path of the training file.
          word_word2vec: mapping token -> embedding row index.
          task: ``"task1"``, ``"task2"`` or ``"task3"``.

      Returns:
          Tuple ``(Z, X_word, Y, cw_y)``: array of ids, array of index
          sequences, array of one-hot targets, and the class-weight dict
          returned by ``create_class_weight(counts, 0.5)``.

      Raises:
          ValueError: if ``task`` is not one of the three known tasks.
      """
      polarity = ("negative", "positive", "objective", "mixed")
      task_labels = {
          "task1": polarity,
          "task2": ("figurative", "nonfigurative"),
          "task3": polarity,
      }
      # Fail early with a clear message instead of crashing later on a
      # None counter inside create_class_weight.
      if task not in task_labels:
          raise ValueError("unknown task: %r" % (task,))
      labels = task_labels[task]

      X_word = []
      Y = []
      Z = []
      Y_COUNTER = [0.0] * len(labels)

      # Single pass: the class counts and the samples are collected together.
      with open(sentiment_file) as f:
          for line in f:
              fields = line.strip().split("\t")

              Z.append(fields[0])

              ar_y = [0] * len(labels)
              if fields[1] in labels:
                  position = labels.index(fields[1])
                  ar_y[position] = 1
                  Y_COUNTER[position] += 1
              Y.append(ar_y)

              X_word.append([word_word2vec[tok]
                             for tok in fields[2].split(" ")
                             if tok in word_word2vec])

      print(Y_COUNTER)
      hash_y_counter = dict(enumerate(Y_COUNTER))
      print(hash_y_counter)
      cw_y = create_class_weight(hash_y_counter, 0.5)
      print(cw_y)

      X_word = np.array(X_word)
      Y = np.array(Y)
      Z = np.array(Z)

      return (Z, X_word, Y, cw_y)
  
  
  def read_sentiment_test(sentiment_file, word_word2vec):
      """Read a test file, ignoring its label column.

      Each line is tab-separated; column 0 is the sample id and column 2 the
      text, tokenised on single spaces.  Tokens missing from
      ``word_word2vec`` are dropped.  Every sample gets the placeholder
      target ``[0]``.

      Returns:
          Tuple ``(Z, X_word, Y)`` of numpy arrays: ids, index sequences and
          placeholder targets.
      """
      ids, sequences, targets = [], [], []

      with open(sentiment_file) as handle:
          for raw in handle:
              columns = raw.strip().split("\t")

              ids.append(columns[0])
              targets.append([0])
              sequences.append([word_word2vec[token]
                                for token in columns[2].split(" ")
                                if token in word_word2vec])

      X_word = np.array(sequences)
      Y = np.array(targets)
      Z = np.array(ids)
      return (Z, X_word, Y)
  
  
  
  
  def read_sentiment(sentiment_file, word_word2vec, task):
      """Read a labelled file into ids, index sequences and one-hot targets.

      Each line is tab-separated: column 0 is the sample id, column 1 the
      label and column 2 the text, tokenised on single spaces.  Tokens
      missing from ``word_word2vec`` are dropped.  ``"task1"`` and
      ``"task3"`` use four classes, ``"task2"`` uses two; a label outside
      the task's classes gives an all-zero row, and any other task gives
      ``None`` as target.

      Returns:
          Tuple ``(Z, X_word, Y)`` of numpy arrays: ids, index sequences and
          targets.
      """
      if task in ("task1", "task3"):
          class_names = ("negative", "positive", "objective", "mixed")
      elif task == "task2":
          class_names = ("figurative", "nonfigurative")
      else:
          class_names = None

      ids, sequences, targets = [], [], []

      with open(sentiment_file) as handle:
          for raw in handle:
              columns = raw.strip().split("\t")

              ids.append(columns[0])

              if class_names is None:
                  one_hot = None
              else:
                  label = columns[1]
                  one_hot = [1 if label == name else 0 for name in class_names]
              targets.append(one_hot)

              sequences.append([word_word2vec[token]
                                for token in columns[2].split(" ")
                                if token in word_word2vec])

      X_word = np.array(sequences)
      Y = np.array(targets)
      Z = np.array(ids)
      return (Z, X_word, Y)
  
  
  
  # Length every index sequence is padded/truncated to before prediction.
  maxlen = 150
  # The two values below are not read anywhere in this script.
  word_nb_feature_maps = 200
  hidden_size = 64

  # Entry point: load a saved model, run it on a test file and print one
  # line per sample: "<id>\t<score_0> <score_1> ...".
  if __name__ == "__main__":
      if len(sys.argv) < 4:
          sys.exit("usage: " + sys.argv[0] + " <word_embedding_file> <test_file> <model_file_prefix>")

      word_embedding_file = sys.argv[1]
      test_file = sys.argv[2]
      # Prefix of the saved model: "<prefix>.json" holds the architecture,
      # "<prefix>.h5" the weights.
      model_file = sys.argv[3]

      # Context manager so the JSON file handle is closed after reading.
      with open(model_file + ".json") as json_file:
          model = model_from_json(json_file.read())
      model.load_weights(model_file + ".h5")

      word_vocab_size, word_embedding_size, word_dico, word_initialize_weight = read_word2vec(word_embedding_file)
      # The "_train" names are historical: these hold the *test* samples.
      ID_train, X_word_train, Y_train = read_sentiment_test(test_file, word_dico)
      X_word_train = sequence.pad_sequences(X_word_train, maxlen=maxlen, padding='post', truncating='post')

      predict = model.predict(X_word_train, batch_size=2048, verbose=0)

      # Single parenthesised argument: same output with the Python 2 print
      # statement and the Python 3 print function.
      for sample_id, scores in zip(ID_train, predict):
          print(sample_id + "\t" + " ".join(map(str, scores)))