# Rouvier Mickael / Deft 2017 - Sentiment Analysis
#
# Blame view
#
# bin/tokenizer.rb 15.5 KB
  #--
  # DO NOT MODIFY!!!!
  # This file is automatically generated by rex 1.0.5
  # from lexical definition file "bin/tokenizer.rex".
  #++
  
  require 'racc/parser'
  
  # Regex-driven lexer for tweet-like text.
  #
  # According to the file header this class was generated by rex 1.0.5 from
  # "bin/tokenizer.rex". The fixes below were applied by hand, so regenerating
  # the file would presumably discard them.
  #
  # Every token is a two-element array [kind, text]. Rules in #_next_token are
  # tried top to bottom and the first one that matches at the current scanner
  # position wins.
  class TokenizerTwitter < Racc::Parser
    require 'strscan'

    # Raised when the scanner state is unknown or no rule matches the input.
    class ScanError < StandardError ; end

    attr_reader   :lineno
    attr_reader   :filename
    attr_accessor :state

    # Resets the scanner so that it reads +str+ from the beginning.
    def scan_setup(str)
      @ss = StringScanner.new(str)
      @lineno =  1
      @state  = nil
    end

    # Runs the block attached to a rule and returns its value (the token).
    def action
      yield
    end

    # Scans +str+ and hands the token stream to the Racc parser (do_parse).
    def scan_str(str)
      scan_setup(str)
      do_parse
    end
    alias :scan :scan_str

    # Remembers +filename+ and prepares the scanner with the file's content.
    def load_file( filename )
      @filename = filename
      # File.read instead of Kernel#open: the latter executes a command when
      # the name starts with "|".
      scan_setup(File.read(filename))
    end

    # Loads +filename+ and hands the token stream to the Racc parser.
    def scan_file( filename )
      load_file(filename)
      do_parse
    end

    # Returns the next token, or nil once the input is exhausted.
    def next_token
      return if @ss.eos?

      # skips empty actions
      token = _next_token
      token = _next_token until token || @ss.eos?
      token
    end

    # Matches exactly one rule at the current position and returns its token.
    #
    # Raises ScanError when @state is not nil (the only state this lexer
    # defines) or when no rule matches.
    def _next_token
      text = nil
      token = case @state
      when nil
        case
        # HTML-escaped ampersand
        when (text = @ss.scan(/&amp;/))
           action { ["wtoken", "&"] }

        # English negative contractions (isn't, don't, ...)
        when (text = @ss.scan(/((isn|aren|wasn|weren|haven|hasn|hadn|won|wouldn|don|doesn|didn|can|couldn|shouldn|mightn|mustn)'t)/))
           action { ["wtoken_negation", text] }

        # URL-like strings; the matched text is replaced by a placeholder
        when (text = @ss.scan(/((([a-zA-Z][a-zA-Z+\-.]*)?:?([\/][\/])?)?((([a-zA-Z0-9]([:\/?\#\[\]@!%\-$&'()*+,;=_~.a-zA-Z0-9])*[a-zA-Z0-9])[.]([a-zA-Z0-9]([:\/?\#\[\]@!%\-$&'()*+,;=_~.a-zA-Z0-9])*[a-zA-Z0-9])))|(([a-zA-Z0-9]([:\/?\#\[\]@!%\-$&'()*+,;=_~.a-zA-Z0-9])*[a-zA-Z0-9])[.]([a-zA-Z0-9]([:\/?\#\[\]@!%\-$&'()*+,;=_~.a-zA-Z0-9])*[a-zA-Z0-9])))/))
           action {["url", "#URL"] }

        # "<3", "<33", ...
        when (text = @ss.scan(/(<(3)+)/))
           action { ["heart", text] }

        # Western-style emoticons such as ":)" or "(:"
        when (text = @ss.scan(/(([<>]?[:;=8][\-o\*\']?[\)\]\(\[dDpP\/\:\}\{@\|\\]{1,3})|([\)\]\(\[dDpP\/\:\}\{@\|\\][\-o\*\']?[:;=8][<>]?))/))
           action { ["smiley", text] }

        # Face-style emoticons such as "^_^"; note this also matches "..", "--"
        when (text = @ss.scan(/((°|[\^\+\.\-\;\*=])([_~\-\-\^\.]){0,3}(°|[\^\+\.\-\;\*=]))|([\^=]?[\^\(=]((°|[\^\+\.\-\;\*=])([_~\-\-\^\.]){0,3}(°|[\^\+\.\-\;\*=]))[=\)\^][=\^]?)/))
           action { ["smiley", text] }

        # @mentions; the matched text is replaced by a placeholder
        when (text = @ss.scan(/(\@([a-zâàãäáæéèêëîïíìòœôöóûùüúçñÿA-ZÂÀÃÄÁÆÉÈÊËÎÏÍÌÒŒÔÖÓÛÙÜÚÇÑŸ0-9]|_)+)/))
           action { ["twitter_username", "#TWITTERUSERNAME"] }

        # #hashtags
        when (text = @ss.scan(/(\#([a-zâàãäáæéèêëîïíìòœôöóûùüúçñÿA-ZÂÀÃÄÁÆÉÈÊËÎÏÍÌÒŒÔÖÓÛÙÜÚÇÑŸ0-9]|'|_|-)+)/))
           action { ["twitter_hashtag", text] }

        when (text = @ss.scan(/['’´]/))
           action { ["apos", text] }

        # "n°", "N º"
        when (text = @ss.scan(/[Nn]( )?(º|°)/))
           action { ["wtoken", text] }

        when (text = @ss.scan(/([0-9]+|(I|V|X|C|L|M|D)+)(st|nd|rd|th)/))
           action { ["ordinal", text] }

        # Clock times such as "12h30"
        when (text = @ss.scan(/[012]?[0-9][hH][012345][0-9]/))
           action { ["timex-time", text] }

        # Durations such as "2h30min"
        when (text = @ss.scan(/([0-9]+([Hh]|[Mm](i?n)?|[Ss](ec)?))+/))
           action { ["timex-chrono", text] }

        # Roman numerals of at least two letters
        when (text = @ss.scan(/((I|V|X|C|L|M|D)(I|V|X|C|L|M|D)+)/))
           action { ["romain", text] }

        # Numbers; the matched text is replaced by a placeholder
        when (text = @ss.scan(/[0-9]+([',.\/ ][0-9]+)*/))
           action { ["cardinal", "#CARDINAL"] }

        when (text = @ss.scan(/([a-zâàãäáæéèêëîïíìòœôöóûùüúçñÿA-ZÂÀÃÄÁÆÉÈÊËÎÏÍÌÒŒÔÖÓÛÙÜÚÇÑŸ]([.][a-zâàãäáæéèêëîïíìòœôöóûùüúçñÿA-ZÂÀÃÄÁÆÉÈÊËÎÏÍÌÒŒÔÖÓÛÙÜÚÇÑŸ])+([.])?)/))
           action {["acronym", text] }

        # Plain words
        when (text = @ss.scan(/[a-zâàãäáæéèêëîïíìòœôöóûùüúçñÿA-ZÂÀÃÄÁÆÉÈÊËÎÏÍÌÒŒÔÖÓÛÙÜÚÇÑŸ0-9][a-zâàãäáæéèêëîïíìòœôöóûùüúçñÿA-ZÂÀÃÄÁÆÉÈÊËÎÏÍÌÒŒÔÖÓÛÙÜÚÇÑŸ0-9]*/))
           action { ["wtoken", text] }

        when (text = @ss.scan(/(<[\/]?[^>]+>)/))
           action {["sgmltag", text] }

        when (text = @ss.scan(/([,;:"'”“(){}&]|«|»|’|—|--|…|‘)/))
           action {["ponctw", text] }

        when (text = @ss.scan(/[A-ZÂÀÃÄÁÆÉÈÊËÎÏÍÌÒŒÔÖÓÛÙÜÚÇÑŸ][h]?[.]/))
           action {["initial", text] }

        # Soft hyphen or ASCII hyphen. The first alternative is spelled as an
        # escape: an invisible (or missing) character there would turn it into
        # an empty alternative that matches "" forever.
        # NOTE(review): assumes the invisible character was U+00AD - confirm
        # against bin/tokenizer.rex.
        when (text = @ss.scan(/(\u00AD|-)/))
           action {["hyphen", text] }

        when (text = @ss.scan(/([!?]+)|([.!?¿¡…])/))
           action {["poncts", text] }

        when (text = @ss.scan(/([\#$%&*+\/<=>@\[\\\]^_`|~]|–|¶|£|¥|₤|€|§|²|³|º|°|µ)/))
           action {["symbol", text] }

        when (text = @ss.scan(/(([A-Za-z0-9\-\._])+[@]([A-Za-z0-9\-\._])+([\.][A-Za-z0-9]+))/))
           action {["email", text] }

        # Whitespace run, written with escapes instead of a raw line break
        when (text = @ss.scan(/[\t\n\v\f\r ]+/))
           action {["space", text] }

        # Any other single character
        when (text = @ss.scan(/./))
           action { ["no_match", text] }

        else
          text = @ss.string[@ss.pos .. -1]
          raise  ScanError, "can not match: '" + text + "'"
        end  # case rule

      else
        raise  ScanError, "undefined state: '" + state.to_s + "'"
      end  # case state

      # Count every newline the matched text consumed, so that lineno stays
      # correct even when one token spans several lines.
      @lineno += text.count("\n")
      token
    end  # def _next_token

    # Tokenizes +code+ without involving the Racc parser.
    #
    # Returns an Array of [kind, text] pairs (empty for empty input).
    def tokenize(code)
      scan_setup(code)
      tokens = []
      while (token = next_token)
        tokens.push(token)
      end
      tokens
    end  # def tokenize

  end  # class TokenizerTwitter
  
  
  # Replacement table used by _accent: HTML entities for accented letters and
  # upper-case accented letters, each mapped to the lower-case accented letter.
  ACCENT_REPLACEMENTS = {
    "&Agrave;" => "à", "&agrave;" => "à",
    "&Acirc;"  => "â", "&acirc;"  => "â",
    "&Ccedil;" => "ç", "&ccedil;" => "ç",
    "&Egrave;" => "è", "&egrave;" => "è",
    "&Eacute;" => "é", "&eacute;" => "é",
    "&Ecirc;"  => "ê", "&ecirc;"  => "ê",
    "&Euml;"   => "ë", "&euml;"   => "ë",
    "&Icirc;"  => "î", "&icirc;"  => "î",
    "&Iuml;"   => "ï", "&iuml;"   => "ï",
    "&Ocirc;"  => "ô", "&ocirc;"  => "ô",
    "&OElig;"  => "œ", "&oelig;"  => "œ",
    "&Ugrave;" => "ù", "&ugrave;" => "ù",
    "&Ucirc;"  => "û", "&ucirc;"  => "û",
    "&Uuml;"   => "ü", "&uuml;"   => "ü",
    "&#376;"   => "ÿ", "&yuml;"   => "ÿ",

    "À" => "à", "Â" => "â", "Ç" => "ç", "È" => "è", "É" => "é",
    "Ê" => "ê", "Ë" => "ë", "Î" => "î", "Ï" => "ï", "Ô" => "ô",
    "Œ" => "œ", "Ù" => "ù", "Û" => "û", "Ü" => "ü", "Ÿ" => "ÿ",

    # Spanish
    "Ñ" => "ñ", "Á" => "á", "Ó" => "ó", "Ú" => "ú", "Í" => "í"
  }.freeze

  # Matches any key of ACCENT_REPLACEMENTS.
  ACCENT_PATTERN = Regexp.union(ACCENT_REPLACEMENTS.keys).freeze

  # Normalizes accented letters in +str+.
  #
  # HTML entities such as "&Eacute;" and upper-case accented letters are
  # replaced by the corresponding lower-case accented letter. Unaccented
  # letters are left untouched.
  #
  # Returns a new String; +str+ itself is not modified.
  def _accent(str)
    # A single pass is safe: every replacement is one accented lower-case
    # letter, which can never form (part of) another key.
    str.gsub(ACCENT_PATTERN, ACCENT_REPLACEMENTS)
  end
  
  
  # Decodes the HTML entities "&gt;" and "&lt;" in +str+.
  #
  # "&gt;" becomes ">" and "&lt;" becomes "<" (the two were previously
  # swapped). Returns a new String.
  def _html(str)
    str.gsub("&gt;", ">").gsub("&lt;", "<")
  end
  
  # Collapses every run of whitespace in +str+ into one ASCII space.
  #
  # Besides the characters matched by \s, the pattern lists a number of
  # Unicode space characters explicitly (U+00A0, U+1680, U+180E, U+202F,
  # U+205F, U+3000 and U+2000..U+200A). Returns a new String.
  def squeezeWhitespace(str)
    whitespace_run = /[\s\u0020\u00a0\u1680\u180e\u202f\u205f\u3000\u2000-\u200a]+/
    str.gsub(whitespace_run, " ")
  end
  
  
  # Returns a lower-cased copy of +str+ (delegates to String#downcase).
  def _downcase(str)
    str.downcase
  end
  
  # Shortens every run of four or more identical characters in +string+ to
  # exactly three, e.g. "cooooool" -> "coool". Shorter runs are kept as is.
  #
  # The pattern uses the /m option so that "." also matches newlines;
  # without it, newline characters were silently removed from the result.
  #
  # Returns a new String.
  def _cleanword(string)
    string.gsub(/(.)\1{3,}/m) { Regexp.last_match(1) * 3 }
  end
  
  # Repairs mojibake in +string+: character sequences that look like UTF-8
  # text decoded with a single-byte encoding are replaced by the intended
  # character.
  #
  # The substitutions are applied one after another in the order listed; the
  # order matters because the last key is a prefix of most of the others.
  #
  # NOTE(review): some keys may contain non-printing characters that are not
  # visible here - verify them against the repository copy.
  #
  # Returns a new String.
  def _convert(string)
    repairs = [
      ["â¬", "€"],
      ["Ã¢", "â"],
      ["Ã©", "é"],
      ["Ã¨", "è"],
      ["Ãª", "ê"],
      ["Ã«", "ë"],
      ["Ã®", "î"],
      ["Ã¯", "ï"],
      ["Ã¶", "ö"],
      ["Ã¹", "ù"],
      ["Ã»", "û"],
      ["Ã¼", "ü"],
      ["Ã§", "ç"],
      ["Å", "œ"],
      ["Â°", "°"],
      ["Ã ", "à "],
      ["Ã", "ô"]
    ]

    repairs.reduce(string) do |text, (garbled, intended)|
      text.gsub(garbled, intended)
    end
  end
  
  # Counts the word tokens of +str+ that are written entirely in ASCII
  # capital letters.
  #
  # The text is cleaned (_convert, _cleanword, squeezeWhitespace, _accent,
  # _html - no lower-casing) before it is tokenized. Returns an Integer.
  def all_caps(str)
    cleaned = _html(_accent(squeezeWhitespace(_cleanword(_convert(str)))))
    tokens = TokenizerTwitter.new.tokenize(cleaned)

    tokens.count do |kind, text|
      kind == "wtoken" && text.match(/^([A-Z]+)$/) && text.upcase == text
    end
  end
  
  # Counts the tokens of +str+ that contain an elongated character run, i.e.
  # tokens whose text is changed by _cleanword.
  #
  # The text is cleaned with _convert, squeezeWhitespace, _accent and _html
  # before it is tokenized. Returns an Integer.
  def elongated_words(str)
    cleaned = _html(_accent(squeezeWhitespace(_convert(str))))
    tokens = TokenizerTwitter.new.tokenize(cleaned)

    tokens.count { |_kind, text| text != _cleanword(text) }
  end
  
  # Tells whether the last token of +str+ is an emoticon.
  #
  # The text is cleaned and lower-cased before it is tokenized.
  #
  # Returns +1 when the last token has kind "smiley", -1 otherwise. Input
  # that yields no token at all (e.g. an empty string) also returns -1
  # instead of raising NoMethodError.
  def last_tokens_emoticons?(str)
    cleaned = _html(_accent(squeezeWhitespace(_downcase(_cleanword(_convert(str))))))
    last_token = TokenizerTwitter.new.tokenize(cleaned).last

    return -1 if last_token.nil?

    last_token[0] == "smiley" ? +1 : -1
  end
  
  # Counts the emoticon tokens (kind "smiley") found in +str+.
  #
  # The text is cleaned and lower-cased before it is tokenized.
  # Returns an Integer.
  def emoticons(str)
    cleaned = _html(_accent(squeezeWhitespace(_downcase(_cleanword(_convert(str))))))
    tokens = TokenizerTwitter.new.tokenize(cleaned)

    tokens.count { |kind, _text| kind == "smiley" }
  end
  
  # Tells whether the last token of +str+ is a URL.
  #
  # The text is cleaned and lower-cased before it is tokenized.
  #
  # Returns +1 when the last token has kind "url", -1 otherwise. Input that
  # yields no token at all (e.g. an empty string) also returns -1 instead of
  # raising NoMethodError.
  def last_tokens_url?(str)
    cleaned = _html(_accent(squeezeWhitespace(_downcase(_cleanword(_convert(str))))))
    last_token = TokenizerTwitter.new.tokenize(cleaned).last

    return -1 if last_token.nil?

    last_token[0] == "url" ? +1 : -1
  end
  
  # Counts emphatic punctuation tokens in +str+.
  #
  # The text is cleaned and lower-cased before it is tokenized.
  #
  # Returns a three-element Array of Integers:
  #   [0] tokens made only of two or more "?"
  #   [1] tokens made only of two or more "!"
  #   [2] tokens containing "?!" or "!?"
  def ponctuation(str)
    cleaned = _html(_accent(squeezeWhitespace(_downcase(_cleanword(_convert(str))))))
    texts = TokenizerTwitter.new.tokenize(cleaned).map { |_kind, text| text }

    [
      texts.count { |text| text.match(/^(\?\?+)$/) },
      texts.count { |text| text.match(/^(!!+)$/) },
      texts.count { |text| text.include?("?!") || text.include?("!?") }
    ]
  end
  
  
  # Tokenizes +str+ and joins the text of every token with a single space.
  #
  # No token kind is filtered out. The text is cleaned and lower-cased before
  # it is tokenized. Returns a String.
  def semi_tokenize(str)
    cleaned = _html(_accent(squeezeWhitespace(_downcase(_cleanword(_convert(str))))))

    TokenizerTwitter.new.tokenize(cleaned).map { |_kind, text| text }.join(" ")
  end
  
  # Tokenizes +str+ and returns the kept tokens together with its hashtags.
  #
  # The text is cleaned and lower-cased before it is tokenized. Tokens of
  # kind ponctw, space, sgmltag, hyphen, poncts, apos and symbol are dropped.
  # English negations written without an apostrophe ("dont") get it back
  # ("don't").
  #
  # Returns a two-element Array: the kept token texts joined by spaces, and
  # the Array of hashtag texts in order of appearance.
  def tokenize_charliehebdo(str)
    dropped_kinds = %w[ponctw space sgmltag hyphen poncts apos symbol]
    contractions = {
      "isnt" => "isn't", "arent" => "aren't", "wasnt" => "wasn't",
      "werent" => "weren't", "havent" => "haven't", "hasnt" => "hasn't",
      "hadnt" => "hadn't", "wont" => "won't", "wouldnt" => "wouldn't",
      "dont" => "don't", "doesnt" => "doesn't", "didnt" => "didn't",
      "cant" => "can't", "couldnt" => "couldn't", "shouldnt" => "shouldn't",
      "mightnt" => "mightn't", "mustnt" => "mustn't"
    }

    cleaned = _html(_accent(squeezeWhitespace(_downcase(_cleanword(_convert(str))))))
    words = []
    hash_tags = []

    TokenizerTwitter.new.tokenize(cleaned).each do |kind, text|
      next if dropped_kinds.include?(kind)

      word = contractions.fetch(text, text)
      hash_tags << word if kind == "twitter_hashtag"
      words << word
    end

    [words.join(" "), hash_tags]
  end
  
  
  
  # Tokenizes +str+ and returns the kept token texts joined by spaces.
  #
  # The text is cleaned and lower-cased before it is tokenized. Tokens of
  # kind ponctw, space, sgmltag, hyphen, poncts, apos and symbol are dropped,
  # as are tokens whose text is ".." or "...". English negations written
  # without an apostrophe ("dont") get it back ("don't").
  #
  # Returns a String.
  def tokenize(str)
    dropped_kinds = %w[ponctw space sgmltag hyphen poncts apos symbol]
    contractions = {
      "isnt" => "isn't", "arent" => "aren't", "wasnt" => "wasn't",
      "werent" => "weren't", "havent" => "haven't", "hasnt" => "hasn't",
      "hadnt" => "hadn't", "wont" => "won't", "wouldnt" => "wouldn't",
      "dont" => "don't", "doesnt" => "doesn't", "didnt" => "didn't",
      "cant" => "can't", "couldnt" => "couldn't", "shouldnt" => "shouldn't",
      "mightnt" => "mightn't", "mustnt" => "mustn't"
    }

    cleaned = _html(_accent(squeezeWhitespace(_downcase(_cleanword(_convert(str))))))
    words = []

    TokenizerTwitter.new.tokenize(cleaned).each do |kind, text|
      next if dropped_kinds.include?(kind)

      word = contractions.fetch(text, text)
      next if word == ".." || word == "..."

      words << word
    end

    words.join(" ")
  end