Rouvier Mickael / Deft 2017 - Sentiment Analysis

Blame view

bin/tokenizer.rex 15.5 KB
  
  class TokenizerTwitter
  macro
  
      LATINSMALL  [a-zâàãäáæéèêëîïíìòœôöóûùüúçñÿ]
  
      LATINCAPITAL  [A-ZÂÀÃÄÁÆÉÈÊËÎÏÍÌÒŒÔÖÓÛÙÜÚÇÑŸ]
  
      LATIN  [a-zâàãäáæéèêëîïíìòœôöóûùüúçñÿA-ZÂÀÃÄÁÆÉÈÊËÎÏÍÌÒŒÔÖÓÛÙÜÚÇÑŸ]
  
      NOTLATIN  [^a-zâàãäáæéèêëîïíìòœôöóûùüúçñÿA-ZÂÀÃÄÁÆÉÈÊËÎÏÍÌÒÔÖÓŒÛÙÜÚÇÑŸ]
  
      DIGIT  [0-9]
  
      ALPHADIGIT  [a-zâàãäáæéèêëîïíìòœôöóûùüúçñÿA-ZÂÀÃÄÁÆÉÈÊËÎÏÍÌÒŒÔÖÓÛÙÜÚÇÑŸ0-9]
  
      USERNAME (\@({ALPHADIGIT}|_)+)
  
      HASHTAG (\#({ALPHADIGIT}|'|_|-)+)
  
      SMILEY (([<>]?[:;=8][\-o\*\']?[\)\]\(\[dDpP\/\:\}\{@\|\\]{1,3})|([\)\]\(\[dDpP\/\:\}\{@\|\\][\-o\*\']?[:;=8][<>]?))
  
      SMILEY_HEAD_BASE ((°|[\^\+\.\-\;\*=])([_~\-\-\^\.]){0,3}(°|[\^\+\.\-\;\*=]))
  
      SMILEY_HEAD ([\^=]?[\^\(=]{SMILEY_HEAD_BASE}[=\)\^][=\^]?)
  
      HEART (<(3)+)
  
      CONTROL [\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F]
  
      IGNORABLE [\x01\x02\x04\x05\x06\x07\x08\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\x7F\xA0]
  
      SPACE [\t
  \v\f\r\ ]
  
      SUFIX (-(ce|CE|y|Y|en|EN|l(a|à|es?)|L(AÀ|ES?)|même|MÊME|[mts]oi|[MTS]OI|je|JE|tu|TU|on|ON|[nv]ous|[NV]OUS|lui|LUI|ils?|ILS?|elles?|ELLES?|(t-(ils?|elles?|on)|T-(ILS?|ELLES?|ON))))
  
      WPONCT ([,;:"'”“(){}&]|«|»|’|—|--|…|‘)
  
      SPONCT_MULTIPLE ([!?]+)
  
      SPONCT ([.!?¿¡…])
  
      SYMBOL ([\#$%&*+\/<=>@\[\\\]^_`|~]|–|¶|£|¥|₤|€|§|²|³|º|°|µ)
  
      ACRONYM ({LATIN}([.]{LATIN})+([.])?)
  
      HYPHEN (|-)
  
  
      URLChar ([:\/?\#\[\]@!%\-$&'()*+,;=_~.a-zA-Z0-9])
      URLScheme ([a-zA-Z][a-zA-Z+\-.]*)
      URLRest (([a-zA-Z0-9]{URLChar}*[a-zA-Z0-9])[.]([a-zA-Z0-9]{URLChar}*[a-zA-Z0-9]))
      URL (({URLScheme}?:?([\/][\/])?)?({URLRest})|{URLRest})
  
      EMAILChar ([A-Za-z0-9\-\._])
      EMAILDomain ([\.][A-Za-z0-9]+)
      EMAIL ({EMAILChar}+[@]{EMAILChar}+{EMAILDomain})
  
      SGMLTag (<[\/]?[^>]+>)
  
      ENDOFTEXT (\x03)
  
      EXCEPTION (Ce|Vers|Me|Le|De|Les|Des|Ces|Mère|Mer)
  
      ROMAIN (I|V|X|C|L|M|D)
  
      ABREVIATION (Mme|Mlle|Mr|M\.|Mr\.|Mrs\.)
  
      VINGT_A_SOIXANTE (vingt|trente|quarante|cinquante|soixante)
      VINGT_A_SOIXANTE_ET_QUATRE_VINGT (vingt|trente|quarante|cinquante|soixante|quatre(\ |-)vingts?)
      VINGT_A_QUATRE_VINGT_DIX (vingt|trente|quarante|cinquante|soixante|soixante(\ |-)dix|quatre(\ |-)vingts?|quatre(\ |-)vingt-dix)
      DEUX_A_NEUF (deux|trois|quatre|cinq|six|sept|huit|neuf)
      DIX_A_DIX_NEUF (dix|onze|douze|treize|quatorze|quinze|seize|dix(\ |-)sept|dix(\ |-)huit|dix(\ |-)neuf)
      DOUZE_A_DIX_NEUF (douze|treize|quatorze|quinze|seize|dix(\ |-)sept|dix(\ |-)huit|dix(\ |-)neuf)
      SOIXANTE_ET_QUATRE_VINGT (soixante|quatre(\ |-)vingts?)
  
      DEUX_A_QUATRE_VINGT_DIX_NEUF ({DEUX_A_NEUF}|{DIX_A_DIX_NEUF}|{VINGT_A_QUATRE_VINGT_DIX}|{VINGT_A_SOIXANTE}(\ |-)et(\ |-)un|soixante(\ |-)et(\ |-)onze|quatre(\ |-)vingts?(\ |-)un|quatre(\ |-)vingt(\ |-)un|quatre(\ |-)vingts?(\ |-)onze|quatre(\ |-)vingts?(\ |-)onze|{VINGT_A_SOIXANTE_ET_QUATRE_VINGT}(\ |-){DEUX_A_NEUF}|{SOIXANTE_ET_QUATRE_VINGT}(\ |-){DOUZE_A_DIX_NEUF})
  
      DEUX_A_NEUF_CENT_QUATRE_VINGT_DIX_NEUF ({DEUX_A_QUATRE_VINGT_DIX_NEUF}|cent(\ |-){DEUX_A_QUATRE_VINGT_DIX_NEUF}|{DEUX_A_NEUF}(\ |-)cents?(\ |-){DEUX_A_QUATRE_VINGT_DIX_NEUF}|{DEUX_A_NEUF}(\ |-)cents)
  
      DEUX_A_999999 ({DEUX_A_NEUF_CENT_QUATRE_VINGT_DIX_NEUF}|mille|{DEUX_A_NEUF_CENT_QUATRE_VINGT_DIX_NEUF}(\ |-)mille|mille(\ |-){DEUX_A_NEUF_CENT_QUATRE_VINGT_DIX_NEUF}|{DEUX_A_NEUF_CENT_QUATRE_VINGT_DIX_NEUF}(\ |-)mille(\ |-){DEUX_A_NEUF_CENT_QUATRE_VINGT_DIX_NEUF})
  
      APOS ['’´]
  
      NUMERO [Nn](\ )?(º|°)
  
      CARDINAL {DIGIT}+([',.\/\ ]{DIGIT}+)*
  
      NEGATION ((isn|aren|wasn|weren|haven|hasn|hadn|won|wouldn|don|doesn|didn|can|couldn|shouldn|mightn|mustn)'t)
  
  rule
      &amp; { ["wtoken", "&"] }
      {NEGATION} { ["wtoken_negation", text] }
      {URL} {["url", "#URL"] }
      {HEART}   { ["heart", text] }
      {SMILEY}  { ["smiley", text] }
      {SMILEY_HEAD_BASE}|{SMILEY_HEAD}  { ["smiley", text] }
      {USERNAME} { ["twitter_username", "#TWITTERUSERNAME"] }
      {HASHTAG} { ["twitter_hashtag", text] }
      #{DEUX_A_999999} { ["rcardinal", "#CHIFFRE"] }
      {APOS} { ["apos", text] }
      {NUMERO} { ["wtoken", text] }
      ({DIGIT}+|(I|V|X|C|L|M|D)+)(st|nd|rd|th) { ["ordinal", text] }
      [012]?{DIGIT}[hH][012345]{DIGIT} { ["timex-time", text] }
      ({DIGIT}+([Hh]|[Mm](i?n)?|[Ss](ec)?))+ { ["timex-chrono", text] }
      ({ROMAIN}{ROMAIN}+) { ["romain", text] }
      {CARDINAL} { ["cardinal", "#CARDINAL"] }
      {ACRONYM} {["acronym", text] }
      {ALPHADIGIT}{ALPHADIGIT}*  { ["wtoken", text] }
      {SGMLTag} {["sgmltag", text] }
      {WPONCT} {["ponctw", text] }
      {LATINCAPITAL}[h]?[.] {["initial", text] }
      {HYPHEN} {["hyphen", text] }
      {SPONCT_MULTIPLE}|{SPONCT} {["poncts", text] }
      {SYMBOL} {["symbol", text] }
      {EMAIL} {["email", text] }
      {SPACE}+ {["space", text] }
  
      . { ["no_match", text] }
  
  inner
  def tokenize(code)
      scan_setup(code)
      tokens = []
      while token = next_token
          tokens.push( token )
      end
      tokens
  end
  
  end
  
  
  # Lower-cases accented letters and decodes their HTML entities.
  #
  # Both the upper- and lower-case form of an entity (e.g. "&Eacute;" and
  # "&eacute;") decode to the lower-case letter, and literal accented capitals
  # are replaced by their lower-case counterpart. Unaccented letters are left
  # untouched.
  #
  # str     - the String to normalize (not modified).
  # Returns a new String.
  def _accent(str)
      # Applied in order; each pair is [text to find, replacement].
      replacements = [
          # HTML entities
          ["&Agrave;", "à"], ["&agrave;", "à"],
          ["&Acirc;", "â"],  ["&acirc;", "â"],
          ["&Ccedil;", "ç"], ["&ccedil;", "ç"],
          ["&Egrave;", "è"], ["&egrave;", "è"],
          ["&Eacute;", "é"], ["&eacute;", "é"],
          ["&Ecirc;", "ê"],  ["&ecirc;", "ê"],
          ["&Euml;", "ë"],   ["&euml;", "ë"],
          ["&Icirc;", "î"],  ["&icirc;", "î"],
          ["&Iuml;", "ï"],   ["&iuml;", "ï"],
          ["&Ocirc;", "ô"],  ["&ocirc;", "ô"],
          ["&OElig;", "œ"],  ["&oelig;", "œ"],
          ["&Ugrave;", "ù"], ["&ugrave;", "ù"],
          ["&Ucirc;", "û"],  ["&ucirc;", "û"],
          ["&Uuml;", "ü"],   ["&uuml;", "ü"],
          ["&#376;", "ÿ"],   ["&yuml;", "ÿ"],

          # French accented capitals
          ["À", "à"], ["Â", "â"], ["Ç", "ç"],
          ["È", "è"], ["É", "é"], ["Ê", "ê"], ["Ë", "ë"],
          ["Î", "î"], ["Ï", "ï"],
          ["Ô", "ô"], # was mapped to "î", which turned e.g. "HÔTEL" into "Hîtel"
          ["Œ", "œ"],
          ["Ù", "ù"], ["Û", "û"], ["Ü", "ü"],
          ["Ÿ", "ÿ"],

          # Spanish accented capitals
          ["Ñ", "ñ"], ["Á", "á"], ["Ó", "ó"], ["Ú", "ú"], ["Í", "í"]
      ]

      replacements.inject(str) { |word, (from, to)| word.gsub(from, to) }
  end
  
  
  # Decodes the HTML angle-bracket entities: "&lt;" becomes "<" and "&gt;"
  # becomes ">".
  #
  # The two replacements used to be swapped, so "&lt;3" decoded to ">3" and
  # could never be recognised as a heart ("<3") by the tokenizer.
  #
  # str     - the String to decode (not modified).
  # Returns a new String.
  def _html(str)
      str.gsub("&lt;", "<").gsub("&gt;", ">")
  end
  
  # Collapses every run of whitespace (ASCII whitespace plus the Unicode space
  # characters listed in the pattern) into a single ASCII space.
  #
  # str     - the String to process (not modified).
  # Returns a new String.
  def squeezeWhitespace(str)
      whitespace_run = /[\s\u0020\u00a0\u1680\u180e\u202f\u205f\u3000\u2000-\u200a]+/
      str.gsub(whitespace_run, " ")
  end
  
  
  # Returns a lower-cased copy of +str+ (delegates to String#downcase).
  def _downcase(str)
      lowered = str.downcase
      lowered
  end
  
  # Shortens every run of a repeated character to at most three occurrences,
  # e.g. "coooool" becomes "coool".
  #
  # The string is split into runs with a pattern built on `.`, so characters
  # that `.` does not match (line breaks) belong to no run and are absent from
  # the result.
  #
  # string  - the String to process (not modified).
  # Returns a new String.
  def _cleanword(string)
      runs = string.scan(/((.)\2*)/)
      shortened = runs.map { |whole_run, _char| whole_run[0..2] }
      shortened.join("")
  end
  
  # Repairs common mis-decoded (mojibake) character sequences by replacing each
  # one with the character it stands for.
  #
  # The replacements are applied strictly in the order listed: the last entry
  # rewrites every remaining bare "Ã", so it has to run after all the longer
  # "Ã…" sequences.
  #
  # string  - the String to repair (not modified).
  # Returns a new String.
  def _convert(string)
      ordered_fixes = [
          ["â¬", "€"],
          ["Ã¢", "â"],
          ["Ã©", "é"],
          ["Ã¨", "è"],
          ["Ãª", "ê"],
          ["Ã«", "ë"],
          ["Ã®", "î"],
          ["Ã¯", "ï"],
          ["Ã¶", "ö"],
          ["Ã¹", "ù"],
          ["Ã»", "û"],
          ["Ã¼", "ü"],
          ["Ã§", "ç"],
          ["Å", "œ"],
          ["Â°", "°"],
          ["Ã ", "à "],
          ["Ã", "ô"]
      ]

      repaired = string
      ordered_fixes.each do |garbled, intended|
          repaired = repaired.gsub(garbled, intended)
      end
      repaired
  end
  
  # Counts the word tokens of +str+ that consist solely of the ASCII capitals
  # A-Z.
  #
  # The text is normalized with _convert, _cleanword, squeezeWhitespace,
  # _accent and _html (in that order, without lower-casing) before it is
  # tokenized.
  #
  # Returns the count as an Integer.
  def all_caps(str)
      prepared = _html( _accent( squeezeWhitespace( _cleanword( _convert( str ) ) ) ) )
      tokens = TokenizerTwitter.new.tokenize( prepared )

      tokens.count do |kind, text|
          kind == "wtoken" && text.match(/^([A-Z]+)$/) && text.upcase == text
      end
  end
  
  # Counts the tokens of +str+ that contain a character repeated more than
  # three times in a row, i.e. tokens that _cleanword would shorten.
  #
  # The text is normalized with _convert, squeezeWhitespace, _accent and _html
  # before tokenizing; _cleanword is deliberately left out of that chain so
  # the elongations are still present in the tokens.
  #
  # Returns the count as an Integer.
  def elongated_words(str)
      prepared = _html( _accent( squeezeWhitespace( _convert( str ) ) ) )
      tokens = TokenizerTwitter.new.tokenize( prepared )

      tokens.count { |_kind, text| text != _cleanword(text) }
  end
  
  # Tells whether the last token of +str+ is an emoticon.
  #
  # The text is normalized (_convert, _cleanword, _downcase,
  # squeezeWhitespace, _accent, _html) and tokenized first.
  #
  # Returns +1 when the final token has kind "smiley", -1 otherwise. An input
  # that produces no tokens (e.g. the empty string) also yields -1; previously
  # it raised NoMethodError because the token list had no last element.
  def last_tokens_emoticons?(str)
      evaluator = TokenizerTwitter.new
      ar = evaluator.tokenize( _html( _accent( squeezeWhitespace(_downcase(  _cleanword( _convert( str ) ) )) ) ) )

      final_token = ar.last
      return -1 if final_token.nil?

      final_token[0] == "smiley" ? +1 : -1
  end
  
  # Counts the emoticon tokens (kind "smiley") found in +str+ after the text
  # has been normalized (_convert, _cleanword, _downcase, squeezeWhitespace,
  # _accent, _html) and tokenized.
  #
  # Returns the count as an Integer.
  def emoticons(str)
      prepared = _html( _accent( squeezeWhitespace( _downcase( _cleanword( _convert( str ) ) ) ) ) )
      tokens = TokenizerTwitter.new.tokenize( prepared )

      tokens.count { |kind, _text| kind == "smiley" }
  end
  
  # Tells whether the last token of +str+ is a URL.
  #
  # The text is normalized (_convert, _cleanword, _downcase,
  # squeezeWhitespace, _accent, _html) and tokenized first.
  #
  # Returns +1 when the final token has kind "url", -1 otherwise. An input
  # that produces no tokens (e.g. the empty string) also yields -1; previously
  # it raised NoMethodError because the token list had no last element.
  def last_tokens_url?(str)
      evaluator = TokenizerTwitter.new
      ar = evaluator.tokenize( _html( _accent( squeezeWhitespace(_downcase(  _cleanword( _convert( str ) ) )) ) ) )

      final_token = ar.last
      return -1 if final_token.nil?

      final_token[0] == "url" ? +1 : -1
  end
  
  # Counts emphatic punctuation tokens in +str+.
  #
  # The text is normalized (_convert, _cleanword, _downcase,
  # squeezeWhitespace, _accent, _html) and tokenized; every token is then
  # checked against three independent criteria.
  #
  # Returns a three-element Array of Integers:
  #   [0] tokens made only of two or more "?"
  #   [1] tokens made only of two or more "!"
  #   [2] tokens containing "?!" or "!?"
  def ponctuation(str)
      prepared = _html( _accent( squeezeWhitespace( _downcase( _cleanword( _convert( str ) ) ) ) ) )
      tokens = TokenizerTwitter.new.tokenize( prepared )

      tallies = [0, 0, 0]
      tokens.each do |_kind, text|
          tallies[0] += 1 if text.match(/^(\?\?+)$/)
          tallies[1] += 1 if text.match(/^(!!+)$/)
          # A token holding both "?!" and "!?" is still counted once.
          tallies[2] += 1 if text.include?("?!") || text.include?("!?")
      end

      tallies
  end
  
  
  # Normalizes and tokenizes +str+, then joins the text of every token —
  # whatever its kind, whitespace tokens included — with a single space.
  #
  # Returns the joined String.
  def semi_tokenize(str)
      prepared = _html( _accent( squeezeWhitespace( _downcase( _cleanword( _convert( str ) ) ) ) ) )
      tokens = TokenizerTwitter.new.tokenize( prepared )

      tokens.map { |token| token[1] }.join(" ")
  end
  
  # Normalizes and tokenizes +str+, keeping only content tokens and collecting
  # the hashtags on the side.
  #
  # Tokens of kind ponctw, space, sgmltag, hyphen, poncts, apos and symbol are
  # discarded. English negative contractions written without an apostrophe
  # ("dont", "cant", ...) are rewritten with one ("don't", "can't", ...).
  #
  # Returns two values: the kept token texts joined by single spaces, and an
  # Array with the text of every hashtag token.
  def tokenize_charliehebdo(str)
      ignored_kinds = ["ponctw", "space", "sgmltag", "hyphen", "poncts", "apos", "symbol"]
      contractions = {
          "isnt"     => "isn't",
          "arent"    => "aren't",
          "wasnt"    => "wasn't",
          "werent"   => "weren't",
          "havent"   => "haven't",
          "hasnt"    => "hasn't",
          "hadnt"    => "hadn't",
          "wont"     => "won't",
          "wouldnt"  => "wouldn't",
          "dont"     => "don't",
          "doesnt"   => "doesn't",
          "didnt"    => "didn't",
          "cant"     => "can't",
          "couldnt"  => "couldn't",
          "shouldnt" => "shouldn't",
          "mightnt"  => "mightn't",
          "mustnt"   => "mustn't"
      }

      prepared = _html( _accent( squeezeWhitespace( _downcase( _cleanword( _convert( str ) ) ) ) ) )
      tokens = TokenizerTwitter.new.tokenize( prepared )

      words = []
      hash_tags = []

      tokens.each do |token|
          kind = token[0]
          next if ignored_kinds.include?(kind)

          # Restore the apostrophe of a known contraction, if any.
          token[1] = contractions[token[1]] if contractions.key?(token[1])

          hash_tags.push( token[1] ) if kind == "twitter_hashtag"
          words.push( token[1] )
      end

      return words.join(" "), hash_tags
  end
  
  
  
  # Normalizes and tokenizes +str+, returning the content tokens as one
  # space-separated String.
  #
  # Tokens of kind ponctw, space, sgmltag, hyphen, poncts, apos and symbol are
  # discarded, as are the literal tokens ".." and "...". English negative
  # contractions written without an apostrophe ("dont", "cant", ...) are
  # rewritten with one ("don't", "can't", ...).
  def tokenize(str)
      ignored_kinds = ["ponctw", "space", "sgmltag", "hyphen", "poncts", "apos", "symbol"]
      contractions = {
          "isnt"     => "isn't",
          "arent"    => "aren't",
          "wasnt"    => "wasn't",
          "werent"   => "weren't",
          "havent"   => "haven't",
          "hasnt"    => "hasn't",
          "hadnt"    => "hadn't",
          "wont"     => "won't",
          "wouldnt"  => "wouldn't",
          "dont"     => "don't",
          "doesnt"   => "doesn't",
          "didnt"    => "didn't",
          "cant"     => "can't",
          "couldnt"  => "couldn't",
          "shouldnt" => "shouldn't",
          "mightnt"  => "mightn't",
          "mustnt"   => "mustn't"
      }

      prepared = _html( _accent( squeezeWhitespace( _downcase( _cleanword( _convert( str ) ) ) ) ) )
      tokens = TokenizerTwitter.new.tokenize( prepared )

      words = []

      tokens.each do |token|
          next if ignored_kinds.include?(token[0])

          # Restore the apostrophe of a known contraction, if any.
          token[1] = contractions[token[1]] if contractions.key?(token[1])

          # Ellipsis-like tokens carry no content.
          next if token[1] == ".." || token[1] == "..."

          words.push( token[1] )
      end

      return words.join(" ")
  end