#--
# DO NOT MODIFY!!!!
# This file is automatically generated by rex 1.0.5
# from lexical definition file "bin/tokenizer.rex".
#++

require 'racc/parser'

class TokenizerTwitter < Racc::Parser
  require 'strscan'

  class ScanError < StandardError ; end

  attr_reader   :lineno
  attr_reader   :filename
  attr_accessor :state

  def scan_setup(str)
    @ss = StringScanner.new(str)
    @lineno =  1
    @state  = nil
  end

  def action
    yield
  end

  def scan_str(str)
    scan_setup(str)
    do_parse
  end
  alias :scan :scan_str

  def load_file( filename )
    @filename = filename
    open(filename, "r") do |f|
      scan_setup(f.read)
    end
  end

  def scan_file( filename )
    load_file(filename)
    do_parse
  end


  def next_token
    return if @ss.eos?
    
    # skips empty actions
    until token = _next_token or @ss.eos?; end
    token
  end

  def _next_token
    text = @ss.peek(1)
    @lineno  +=  1  if text == "\n"
    token = case @state
    when nil
      case
      when (text = @ss.scan(/&amp;/))
         action { ["wtoken", "&"] }

      when (text = @ss.scan(/((isn|aren|wasn|weren|haven|hasn|hadn|won|wouldn|don|doesn|didn|can|couldn|shouldn|mightn|mustn)'t)/))
         action { ["wtoken_negation", text] }

      when (text = @ss.scan(/((([a-zA-Z][a-zA-Z+\-.]*)?:?([\/][\/])?)?((([a-zA-Z0-9]([:\/?\#\[\]@!%\-$&'()*+,;=_~.a-zA-Z0-9])*[a-zA-Z0-9])[.]([a-zA-Z0-9]([:\/?\#\[\]@!%\-$&'()*+,;=_~.a-zA-Z0-9])*[a-zA-Z0-9])))|(([a-zA-Z0-9]([:\/?\#\[\]@!%\-$&'()*+,;=_~.a-zA-Z0-9])*[a-zA-Z0-9])[.]([a-zA-Z0-9]([:\/?\#\[\]@!%\-$&'()*+,;=_~.a-zA-Z0-9])*[a-zA-Z0-9])))/))
         action {["url", "#URL"] }

      when (text = @ss.scan(/(<(3)+)/))
         action { ["heart", text] }

      when (text = @ss.scan(/(([<>]?[:;=8][\-o\*\']?[\)\]\(\[dDpP\/\:\}\{@\|\\]{1,3})|([\)\]\(\[dDpP\/\:\}\{@\|\\][\-o\*\']?[:;=8][<>]?))/))
         action { ["smiley", text] }

      when (text = @ss.scan(/((°|[\^\+\.\-\;\*=])([_~\-\-\^\.]){0,3}(°|[\^\+\.\-\;\*=]))|([\^=]?[\^\(=]((°|[\^\+\.\-\;\*=])([_~\-\-\^\.]){0,3}(°|[\^\+\.\-\;\*=]))[=\)\^][=\^]?)/))
         action { ["smiley", text] }

      when (text = @ss.scan(/(\@([a-zâàãäáæéèêëîïíìòœôöóûùüúçñÿA-ZÂÀÃÄÁÆÉÈÊËÎÏÍÌÒŒÔÖÓÛÙÜÚÇÑŸ0-9]|_)+)/))
         action { ["twitter_username", "#TWITTERUSERNAME"] }

      when (text = @ss.scan(/(\#([a-zâàãäáæéèêëîïíìòœôöóûùüúçñÿA-ZÂÀÃÄÁÆÉÈÊËÎÏÍÌÒŒÔÖÓÛÙÜÚÇÑŸ0-9]|'|_|-)+)/))
         action { ["twitter_hashtag", text] }

      when (text = @ss.scan(/['’´]/))
         action { ["apos", text] }

      when (text = @ss.scan(/[Nn]( )?(º|°)/))
         action { ["wtoken", text] }

      when (text = @ss.scan(/([0-9]+|(I|V|X|C|L|M|D)+)(st|nd|rd|th)/))
         action { ["ordinal", text] }

      when (text = @ss.scan(/[012]?[0-9][hH][012345][0-9]/))
         action { ["timex-time", text] }

      when (text = @ss.scan(/([0-9]+([Hh]|[Mm](i?n)?|[Ss](ec)?))+/))
         action { ["timex-chrono", text] }

      when (text = @ss.scan(/((I|V|X|C|L|M|D)(I|V|X|C|L|M|D)+)/))
         action { ["romain", text] }

      when (text = @ss.scan(/[0-9]+([',.\/ ][0-9]+)*/))
         action { ["cardinal", "#CARDINAL"] }

      when (text = @ss.scan(/([a-zâàãäáæéèêëîïíìòœôöóûùüúçñÿA-ZÂÀÃÄÁÆÉÈÊËÎÏÍÌÒŒÔÖÓÛÙÜÚÇÑŸ]([.][a-zâàãäáæéèêëîïíìòœôöóûùüúçñÿA-ZÂÀÃÄÁÆÉÈÊËÎÏÍÌÒŒÔÖÓÛÙÜÚÇÑŸ])+([.])?)/))
         action {["acronym", text] }

      when (text = @ss.scan(/[a-zâàãäáæéèêëîïíìòœôöóûùüúçñÿA-ZÂÀÃÄÁÆÉÈÊËÎÏÍÌÒŒÔÖÓÛÙÜÚÇÑŸ0-9][a-zâàãäáæéèêëîïíìòœôöóûùüúçñÿA-ZÂÀÃÄÁÆÉÈÊËÎÏÍÌÒŒÔÖÓÛÙÜÚÇÑŸ0-9]*/))
         action { ["wtoken", text] }

      when (text = @ss.scan(/(<[\/]?[^>]+>)/))
         action {["sgmltag", text] }

      when (text = @ss.scan(/([,;:"'”“(){}&]|«|»|’|—|--|…|‘)/))
         action {["ponctw", text] }

      when (text = @ss.scan(/[A-ZÂÀÃÄÁÆÉÈÊËÎÏÍÌÒŒÔÖÓÛÙÜÚÇÑŸ][h]?[.]/))
         action {["initial", text] }

      when (text = @ss.scan(/(­|-)/))
         action {["hyphen", text] }

      when (text = @ss.scan(/([!?]+)|([.!?¿¡…])/))
         action {["poncts", text] }

      when (text = @ss.scan(/([\#$%&*+\/<=>@\[\\\]^_`|~]|–|¶|£|¥|₤|€|§|²|³|º|°|µ)/))
         action {["symbol", text] }

      when (text = @ss.scan(/(([A-Za-z0-9\-\._])+[@]([A-Za-z0-9\-\._])+([\.][A-Za-z0-9]+))/))
         action {["email", text] }

      when (text = @ss.scan(/[\t\n\v\f\r ]+/))
         action {["space", text] }

      when (text = @ss.scan(/./))
         action { ["no_match", text] }

      else
        text = @ss.string[@ss.pos .. -1]
        raise  ScanError, "can not match: '" + text + "'"
      end  # if

    else
      raise  ScanError, "undefined state: '" + state.to_s + "'"
    end  # case state
    token
  end  # def _next_token

  # Scans the whole input and returns the list of [tag, form] tokens.
  def tokenize(code)
    scan_setup(code)
    tokens = []
    while token = next_token
      tokens.push(token)
    end
    tokens
  end # def tokenize

end
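
# Illustrative usage of the generated scanner (token forms depend on the
# rules above):
#
#   tok = TokenizerTwitter.new
#   tok.tokenize("hello :) #test")
#   #=> [["wtoken", "hello"], ["space", " "], ["smiley", ":)"],
#   #    ["space", " "], ["twitter_hashtag", "#test"]]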


# Decodes HTML entities for accented letters and lowercases accented
# uppercase letters (French and Spanish).
def _accent(str)

    temp_word = str

    temp_word = temp_word.gsub("&Agrave;", "à")
    temp_word = temp_word.gsub("&agrave;", "à")
    temp_word = temp_word.gsub("&Acirc;", "â")
    temp_word = temp_word.gsub("&acirc;", "â")
    temp_word = temp_word.gsub("&Ccedil;", "ç")
    temp_word = temp_word.gsub("&ccedil;", "ç")
    temp_word = temp_word.gsub("&Egrave;", "è")
    temp_word = temp_word.gsub("&egrave;", "è")
    temp_word = temp_word.gsub("&Eacute;", "é")
    temp_word = temp_word.gsub("&eacute;", "é")
    temp_word = temp_word.gsub("&Ecirc;", "ê")
    temp_word = temp_word.gsub("&ecirc;", "ê")
    temp_word = temp_word.gsub("&Euml;", "ë")
    temp_word = temp_word.gsub("&euml;", "ë")
    temp_word = temp_word.gsub("&Icirc;", "î")
    temp_word = temp_word.gsub("&icirc;", "î")
    temp_word = temp_word.gsub("&Iuml;", "ï")
    temp_word = temp_word.gsub("&iuml;", "ï")
    temp_word = temp_word.gsub("&Ocirc;", "ô")
    temp_word = temp_word.gsub("&ocirc;", "ô")
    temp_word = temp_word.gsub("&OElig;", "œ")
    temp_word = temp_word.gsub("&oelig;", "œ")
    temp_word = temp_word.gsub("&Ugrave;", "ù")
    temp_word = temp_word.gsub("&ugrave;", "ù")
    temp_word = temp_word.gsub("&Ucirc;", "û")
    temp_word = temp_word.gsub("&ucirc;", "û")
    temp_word = temp_word.gsub("&Uuml;", "ü")
    temp_word = temp_word.gsub("&uuml;", "ü")
    temp_word = temp_word.gsub("&#376;", "ÿ")
    temp_word = temp_word.gsub("&yuml;", "ÿ")

    temp_word = temp_word.gsub("À", "à")
    temp_word = temp_word.gsub("Â", "â")
    temp_word = temp_word.gsub("Ç", "ç")
    temp_word = temp_word.gsub("È", "è")
    temp_word = temp_word.gsub("É", "é")
    temp_word = temp_word.gsub("Ê", "ê")
    temp_word = temp_word.gsub("Ë", "ë")
    temp_word = temp_word.gsub("Î", "î")
    temp_word = temp_word.gsub("Ï", "ï")
    temp_word = temp_word.gsub("Ô", "î")
    temp_word = temp_word.gsub("Œ", "œ")
    temp_word = temp_word.gsub("Ù", "ù")
    temp_word = temp_word.gsub("Û", "û")
    temp_word = temp_word.gsub("Ü", "ü")
    temp_word = temp_word.gsub("Ü", "ü")

    temp_word = temp_word.gsub("Ÿ", "ÿ")
    temp_word = temp_word.gsub("Ÿ", "ÿ")

    #Spanish
    temp_word = temp_word.gsub("Ñ", "ñ")
    temp_word = temp_word.gsub("Á", "á")
    temp_word = temp_word.gsub("Ó", "ó")
    temp_word = temp_word.gsub("Ú", "ú")
    temp_word = temp_word.gsub("Í", "í")

    return temp_word

end
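
# Example (illustrative):
#   _accent("&Eacute;t&eacute;")  #=> "été"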


# Decodes the HTML entities for angle brackets.
def _html(str)
    temp_word = str

    temp_word = temp_word.gsub("&gt;", ">")
    temp_word = temp_word.gsub("&lt;", "<")

    return temp_word
end
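
# Example (illustrative): _html("&lt;3")  #=> "<3"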

# Collapses every run of whitespace, including Unicode space
# characters, into a single ASCII space.
def squeezeWhitespace(str)
    return str.gsub(/[\s\u0020\u00a0\u1680\u180e\u202f\u205f\u3000\u2000-\u200a]+/, " ")
end
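
# Example (illustrative): squeezeWhitespace("a\u00a0 b")  #=> "a b"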


def _downcase(str)
    return str.downcase
end

# Caps every run of repeated characters at three, normalizing
# elongated words.
def _cleanword(string)
    return string.scan(/((.)\2*)/).map(&:first).map { |x| x[0..2] }.join("")
end
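
# Examples (illustrative):
#   _cleanword("heyyyy")  #=> "heyyy"
#   _cleanword("!!!!!!")  #=> "!!!"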

# Repairs common UTF-8-read-as-Latin-1 mojibake sequences.
def _convert(string)
    temp_word = string

    temp_word = temp_word.gsub("â¬", "€")
    temp_word = temp_word.gsub("Ã¢", "â")
    temp_word = temp_word.gsub("Ã©", "é")
    temp_word = temp_word.gsub("Ã¨", "è")
    temp_word = temp_word.gsub("Ãª", "ê")
    temp_word = temp_word.gsub("Ã«", "ë")
    temp_word = temp_word.gsub("Ã®", "î")
    temp_word = temp_word.gsub("Ã¯", "ï")
    temp_word = temp_word.gsub("Ã¶", "ö")
    temp_word = temp_word.gsub("Ã¹", "ù")
    temp_word = temp_word.gsub("Ã»", "û")
    temp_word = temp_word.gsub("Ã¼", "ü")
    temp_word = temp_word.gsub("Ã§", "ç")
    temp_word = temp_word.gsub("Å", "œ")
    temp_word = temp_word.gsub("Â°", "°")
    temp_word = temp_word.gsub("Ã ", "à ")
    temp_word = temp_word.gsub("Ã", "ô")

    return temp_word
end
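
# Example (illustrative): _convert("Ã©tÃ©")  #=> "été"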

# Counts fully uppercase word tokens ("shouting") in the input.
def all_caps(str)
    evaluator = TokenizerTwitter.new
    ar = evaluator.tokenize( _html( _accent( squeezeWhitespace( _cleanword( _convert( str ) ) ) ) ) )

    counter = 0
    ar.each do |tag, form|
        counter += 1 if tag == "wtoken" && form =~ /\A[A-Z]+\z/
    end

    counter
end
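
# Example (illustrative): all_caps("This is SO COOL")  #=> 2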

# Counts elongated tokens: tokens whose form changes once character
# runs are capped at three. _cleanword is deliberately left out of the
# preprocessing pipeline here so elongations survive to be counted.
def elongated_words(str)
    evaluator = TokenizerTwitter.new
    ar = evaluator.tokenize( _html( _accent( squeezeWhitespace( _convert( str ) ) ) ) )

    counter = 0
    ar.each do |_tag, form|
        counter += 1 if form != _cleanword(form)
    end

    counter
end
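
# Example (illustrative): elongated_words("sooooo goodddd morning")  #=> 2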

# Binary feature: +1 if the final token is an emoticon, -1 otherwise
# (also -1 for empty input, which yields no tokens).
def last_tokens_emoticons?(str)
    evaluator = TokenizerTwitter.new
    ar = evaluator.tokenize( _html( _accent( squeezeWhitespace( _downcase( _cleanword( _convert( str ) ) ) ) ) ) )
    return +1 if ar.last && ar.last[0] == "smiley"
    -1
end
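
# Example (illustrative): last_tokens_emoticons?("good night :)")  #=> 1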

# Counts emoticon tokens in the input.
def emoticons(str)
    evaluator = TokenizerTwitter.new
    ar = evaluator.tokenize( _html( _accent( squeezeWhitespace( _downcase( _cleanword( _convert( str ) ) ) ) ) ) )
    counter = 0
    ar.each do |tag, _form|
        counter += 1 if tag == "smiley"
    end
    counter
end
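
# Example (illustrative): emoticons("great :) thanks ;)")  #=> 2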

# Binary feature: +1 if the final token is a URL, -1 otherwise
# (also -1 for empty input).
def last_tokens_url?(str)
    evaluator = TokenizerTwitter.new
    ar = evaluator.tokenize( _html( _accent( squeezeWhitespace( _downcase( _cleanword( _convert( str ) ) ) ) ) ) )
    return +1 if ar.last && ar.last[0] == "url"
    -1
end
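
# Example (illustrative): last_tokens_url?("see example.com")  #=> 1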

# Returns a three-element count array:
#   [0] runs of question marks ("??", "???", ...)
#   [1] runs of exclamation marks ("!!", "!!!", ...)
#   [2] mixed "?!" / "!?" sequences
def ponctuation(str)
    ponct = Array.new(3, 0)
    evaluator = TokenizerTwitter.new
    ar = evaluator.tokenize( _html( _accent( squeezeWhitespace( _downcase( _cleanword( _convert( str ) ) ) ) ) ) )

    ar.each do |_tag, form|
        ponct[0] += 1 if form =~ /\A\?\?+\z/
        ponct[1] += 1 if form =~ /\A!!+\z/
        ponct[2] += 1 if form.include?("?!") || form.include?("!?")
    end

    ponct
end
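
# Example (illustrative): ponctuation("what?? no!! really?!")  #=> [1, 1, 1]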


# Joins every token form with single spaces, keeping all tokens,
# including punctuation and markup.
def semi_tokenize(str)
    evaluator = TokenizerTwitter.new
    ar = evaluator.tokenize( _html( _accent( squeezeWhitespace( _downcase( _cleanword( _convert( str ) ) ) ) ) ) )
    ar.map { |_tag, form| form }.join(" ")
end
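
# Example (illustrative): semi_tokenize("hello,world")  #=> "hello , world"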

# Token categories dropped from the cleaned output.
SKIP_TAGS = ["ponctw", "space", "sgmltag", "hyphen", "poncts", "apos", "symbol"]

# Restores the apostrophe in negated contractions typed without one.
NEGATION_FIXES = {
    "isnt"     => "isn't",     "arent"   => "aren't",
    "wasnt"    => "wasn't",    "werent"  => "weren't",
    "havent"   => "haven't",   "hasnt"   => "hasn't",
    "hadnt"    => "hadn't",    "wont"    => "won't",
    "wouldnt"  => "wouldn't",  "dont"    => "don't",
    "doesnt"   => "doesn't",   "didnt"   => "didn't",
    "cant"     => "can't",     "couldnt" => "couldn't",
    "shouldnt" => "shouldn't", "mightnt" => "mightn't",
    "mustnt"   => "mustn't"
}

# Tokenizes a tweet and returns the cleaned token string together with
# the list of hashtags found in it.
def tokenize_charliehebdo(str)
    hash_tags = []
    y = []
    evaluator = TokenizerTwitter.new
    ar = evaluator.tokenize( _html( _accent( squeezeWhitespace( _downcase( _cleanword( _convert( str ) ) ) ) ) ) )

    ar.each do |tag, form|
        next if SKIP_TAGS.include?(tag)

        # French abbreviation expansions, currently disabled:
        #form = "pour" if form == "pr"
        #form = "putain" if form == "ptn"
        #form = "maintenant" if form == "mnt"
        #form = "beaucoup" if form == "bcp"

        form = NEGATION_FIXES.fetch(form, form)
        hash_tags.push(form) if tag == "twitter_hashtag"
        y.push(form)
    end

    return y.join(" "), hash_tags
end
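
# Example (illustrative):
#   tokenize_charliehebdo("I cant believe it #JeSuisCharlie")
#   #=> ["i can't believe it #jesuischarlie", ["#jesuischarlie"]]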



# Tokenizes a tweet into one cleaned, space-joined token string,
# dropping punctuation, markup, and ellipses.
def tokenize(str)
    y = []
    evaluator = TokenizerTwitter.new
    ar = evaluator.tokenize( _html( _accent( squeezeWhitespace( _downcase( _cleanword( _convert( str ) ) ) ) ) ) )

    ar.each do |tag, form|
        next if SKIP_TAGS.include?(tag)
        form = NEGATION_FIXES.fetch(form, form)
        y.push(form) unless form == ".." || form == "..."
    end

    y.join(" ")
end
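
# Example (illustrative): tokenize("DON'T stop!!! :-)")  #=> "don't stop :-)"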

