class TokenizerTwitter macro LATINSMALL [a-zâàãäáæéèêëîïíìòœôöóûùüúçñÿ] LATINCAPITAL [A-ZÂÀÃÄÁÆÉÈÊËÎÏÍÌÒŒÔÖÓÛÙÜÚÇÑŸ] LATIN [a-zâàãäáæéèêëîïíìòœôöóûùüúçñÿA-ZÂÀÃÄÁÆÉÈÊËÎÏÍÌÒŒÔÖÓÛÙÜÚÇÑŸ] NOTLATIN [^a-zâàãäáæéèêëîïíìòœôöóûùüúçñÿA-ZÂÀÃÄÁÆÉÈÊËÎÏÍÌÒÔÖÓŒÛÙÜÚÇÑŸ] DIGIT [0-9] ALPHADIGIT [a-zâàãäáæéèêëîïíìòœôöóûùüúçñÿA-ZÂÀÃÄÁÆÉÈÊËÎÏÍÌÒŒÔÖÓÛÙÜÚÇÑŸ0-9] USERNAME (\@({ALPHADIGIT}|_)+) HASHTAG (\#({ALPHADIGIT}|'|_|-)+) SMILEY (([<>]?[:;=8][\-o\*\']?[\)\]\(\[dDpP\/\:\}\{@\|\\]{1,3})|([\)\]\(\[dDpP\/\:\}\{@\|\\][\-o\*\']?[:;=8][<>]?)) SMILEY_HEAD_BASE ((°|[\^\+\.\-\;\*=])([_~\-\-\^\.]){0,3}(°|[\^\+\.\-\;\*=])) SMILEY_HEAD ([\^=]?[\^\(=]{SMILEY_HEAD_BASE}[=\)\^][=\^]?) HEART (<(3)+) CONTROL [\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F] IGNORABLE [\x01\x02\x04\x05\x06\x07\x08\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F\x7F\xA0] SPACE [\t\n\v\f\r\ ] SUFIX (-(ce|CE|y|Y|en|EN|l(a|à|es?)|L(AÀ|ES?)|même|MÊME|[mts]oi|[MTS]OI|je|JE|tu|TU|on|ON|[nv]ous|[NV]OUS|lui|LUI|ils?|ILS?|elles?|ELLES?|(t-(ils?|elles?|on)|T-(ILS?|ELLES?|ON)))) WPONCT ([,;:"'”“(){}&]|«|»|’|—|--|…|‘) SPONCT_MULTIPLE ([!?]+) SPONCT ([.!?¿¡…]) SYMBOL ([\#$%&*+\/<=>@\[\\\]^_`|~]|–|¶|£|¥|₤|€|§|²|³|º|°|µ) ACRONYM ({LATIN}([.]{LATIN})+([.])?) HYPHEN (­|-) URLChar ([:\/?\#\[\]@!%\-$&'()*+,;=_~.a-zA-Z0-9]) URLScheme ([a-zA-Z][a-zA-Z+\-.]*) URLRest (([a-zA-Z0-9]{URLChar}*[a-zA-Z0-9])[.]([a-zA-Z0-9]{URLChar}*[a-zA-Z0-9])) URL (({URLScheme}?:?([\/][\/])?)?({URLRest})|{URLRest}) EMAILChar ([A-Za-z0-9\-\._]) EMAILDomain ([\.][A-Za-z0-9]+) EMAIL ({EMAILChar}+[@]{EMAILChar}+{EMAILDomain}) SGMLTag (<[\/]?[^>]+>) ENDOFTEXT (\x03) EXCEPTION (Ce|Vers|Me|Le|De|Les|Des|Ces|Mère|Mer) ROMAIN (I|V|X|C|L|M|D) ABREVIATION (Mme|Mlle|Mr|M\.|Mr\.|Mrs\.) VINGT_A_SOIXANTE (vingt|trente|quarante|cinquante|soixante) VINGT_A_SOIXANTE_ET_QUATRE_VINGT (vingt|trente|quarante|cinquante|soixante|quatre(\ |-)vingts?) VINGT_A_QUATRE_VINGT_DIX (vingt|trente|quarante|cinquante|soixante|soixante(\ |-)dix|quatre(\ |-)vingts?|quatre(\ |-)vingt-dix) DEUX_A_NEUF (deux|trois|quatre|cinq|six|sept|huit|neuf) DIX_A_DIX_NEUF (dix|onze|douze|treize|quatorze|quinze|seize|dix(\ |-)sept|dix(\ |-)huit|dix(\ |-)neuf) DOUZE_A_DIX_NEUF (douze|treize|quatorze|quinze|seize|dix(\ |-)sept|dix(\ |-)huit|dix(\ |-)neuf) SOIXANTE_ET_QUATRE_VINGT (soixante|quatre(\ |-)vingts?) DEUX_A_QUATRE_VINGT_DIX_NEUF ({DEUX_A_NEUF}|{DIX_A_DIX_NEUF}|{VINGT_A_QUATRE_VINGT_DIX}|{VINGT_A_SOIXANTE}(\ |-)et(\ |-)un|soixante(\ |-)et(\ |-)onze|quatre(\ |-)vingts?(\ |-)un|quatre(\ |-)vingt(\ |-)un|quatre(\ |-)vingts?(\ |-)onze|quatre(\ |-)vingts?(\ |-)onze|{VINGT_A_SOIXANTE_ET_QUATRE_VINGT}(\ |-){DEUX_A_NEUF}|{SOIXANTE_ET_QUATRE_VINGT}(\ |-){DOUZE_A_DIX_NEUF}) DEUX_A_NEUF_CENT_QUATRE_VINGT_DIX_NEUF ({DEUX_A_QUATRE_VINGT_DIX_NEUF}|cent(\ |-){DEUX_A_QUATRE_VINGT_DIX_NEUF}|{DEUX_A_NEUF}(\ |-)cents?(\ |-){DEUX_A_QUATRE_VINGT_DIX_NEUF}|{DEUX_A_NEUF}(\ |-)cents) DEUX_A_999999 ({DEUX_A_NEUF_CENT_QUATRE_VINGT_DIX_NEUF}|mille|{DEUX_A_NEUF_CENT_QUATRE_VINGT_DIX_NEUF}(\ |-)mille|mille(\ |-){DEUX_A_NEUF_CENT_QUATRE_VINGT_DIX_NEUF}|{DEUX_A_NEUF_CENT_QUATRE_VINGT_DIX_NEUF}(\ |-)mille(\ |-){DEUX_A_NEUF_CENT_QUATRE_VINGT_DIX_NEUF}) APOS ['’´] NUMERO [Nn](\ )?(º|°) CARDINAL {DIGIT}+([',.\/\ ]{DIGIT}+)* NEGATION ((isn|aren|wasn|weren|haven|hasn|hadn|won|wouldn|don|doesn|didn|can|couldn|shouldn|mightn|mustn)'t) rule & { ["wtoken", "&"] } {NEGATION} { ["wtoken_negation", text] } {URL} {["url", "#URL"] } {HEART} { ["heart", text] } {SMILEY} { ["smiley", text] } {SMILEY_HEAD_BASE}|{SMILEY_HEAD} { ["smiley", text] } {USERNAME} { ["twitter_username", "#TWITTERUSERNAME"] } {HASHTAG} { ["twitter_hashtag", text] } #{DEUX_A_999999} { ["rcardinal", "#CHIFFRE"] } {APOS} { ["apos", text] } {NUMERO} { ["wtoken", text] } ({DIGIT}+|(I|V|X|C|L|M|D)+)(st|nd|rd|th) { ["ordinal", text] } [012]?{DIGIT}[hH][012345]{DIGIT} { ["timex-time", text] } ({DIGIT}+([Hh]|[Mm](i?n)?|[Ss](ec)?))+ { ["timex-chrono", text] } ({ROMAIN}{ROMAIN}+) { ["romain", text] } {CARDINAL} { ["cardinal", "#CARDINAL"] } {ACRONYM} {["acronym", text] } {ALPHADIGIT}{ALPHADIGIT}* { ["wtoken", text] } {SGMLTag} {["sgmltag", text] } {WPONCT} {["ponctw", text] } {LATINCAPITAL}[h]?[.] {["initial", text] } {HYPHEN} {["hyphen", text] } {SPONCT_MULTIPLE}|{SPONCT} {["poncts", text] } {SYMBOL} {["symbol", text] } {EMAIL} {["email", text] } {SPACE}+ {["space", text] } . { ["no_match", text] } inner def tokenize(code) scan_setup(code) tokens = [] while token = next_token tokens.push( token ) end tokens end end def _accent(str) temp_word = str temp_word = temp_word.gsub("À", "à") temp_word = temp_word.gsub("à", "à") temp_word = temp_word.gsub("Â", "â") temp_word = temp_word.gsub("â", "â") temp_word = temp_word.gsub("Ç", "ç") temp_word = temp_word.gsub("ç", "ç") temp_word = temp_word.gsub("È", "è") temp_word = temp_word.gsub("è", "è") temp_word = temp_word.gsub("É", "é") temp_word = temp_word.gsub("é", "é") temp_word = temp_word.gsub("Ê", "ê") temp_word = temp_word.gsub("ê", "ê") temp_word = temp_word.gsub("Ë", "ë") temp_word = temp_word.gsub("ë", "ë") temp_word = temp_word.gsub("Î", "î") temp_word = temp_word.gsub("î", "î") temp_word = temp_word.gsub("Ï", "ï") temp_word = temp_word.gsub("ï", "ï") temp_word = temp_word.gsub("Ô", "ô") temp_word = temp_word.gsub("ô", "ô") temp_word = temp_word.gsub("Œ", "œ") temp_word = temp_word.gsub("œ", "œ") temp_word = temp_word.gsub("Ù", "ù") temp_word = temp_word.gsub("ù", "ù") temp_word = temp_word.gsub("Û", "û") temp_word = temp_word.gsub("û", "û") temp_word = temp_word.gsub("Ü", "ü") temp_word = temp_word.gsub("ü", "ü") temp_word = temp_word.gsub("Ÿ", "ÿ") temp_word = temp_word.gsub("ÿ", "ÿ") temp_word = temp_word.gsub("À", "à") temp_word = temp_word.gsub("Â", "â") temp_word = temp_word.gsub("Ç", "ç") temp_word = temp_word.gsub("È", "è") temp_word = temp_word.gsub("É", "é") temp_word = temp_word.gsub("Ê", "ê") temp_word = temp_word.gsub("Ë", "ë") temp_word = temp_word.gsub("Î", "î") temp_word = temp_word.gsub("Ï", "ï") temp_word = temp_word.gsub("Ô", "î") temp_word = temp_word.gsub("Œ", "œ") temp_word = temp_word.gsub("Ù", "ù") temp_word = temp_word.gsub("Û", "û") temp_word = temp_word.gsub("Ü", "ü") temp_word = temp_word.gsub("Ü", "ü") temp_word = temp_word.gsub("Ÿ", "ÿ") temp_word = temp_word.gsub("Ÿ", "ÿ") #Spanish temp_word = temp_word.gsub("Ñ", "ñ") temp_word = temp_word.gsub("Á", "á") temp_word = temp_word.gsub("Ó", "ó") temp_word = temp_word.gsub("Ú", "ú") temp_word = temp_word.gsub("Í", "í") return temp_word end def _html(str) temp_word = str temp_word = temp_word.gsub(">", "<") temp_word = temp_word.gsub("<", ">") return temp_word end def squeezeWhitespace(str) return str.gsub(/[\s\u0020\u00a0\u1680\u180e\u202f\u205f\u3000\u2000-\u200a]+/, " ") end def _downcase(str) return str.downcase end def _cleanword(string) return string.scan(/((.)\2*)/).map(&:first).map { |x| x[0..2] }.join("") end def _convert(string) temp_word = string temp_word = temp_word.gsub("€", "€") temp_word = temp_word.gsub("â", "â") temp_word = temp_word.gsub("é", "é") temp_word = temp_word.gsub("è", "è") temp_word = temp_word.gsub("ê", "ê") temp_word = temp_word.gsub("ë", "ë") temp_word = temp_word.gsub("î", "î") temp_word = temp_word.gsub("ï", "ï") temp_word = temp_word.gsub("ö", "ö") temp_word = temp_word.gsub("ù", "ù") temp_word = temp_word.gsub("û", "û") temp_word = temp_word.gsub("ü", "ü") temp_word = temp_word.gsub("ç", "ç") temp_word = temp_word.gsub("Å", "œ") temp_word = temp_word.gsub("°", "°") temp_word = temp_word.gsub("à ", "à ") temp_word = temp_word.gsub("Ã", "ô") return temp_word end def all_caps(str) evaluator = TokenizerTwitter.new ar = evaluator.tokenize( _html( _accent( squeezeWhitespace( _cleanword( _convert( str ) ) )) ) ) counter = 0 ar.each do |x| if x[0] == "wtoken" if x[1].match(/^([A-Z]+)$/) if x[1].upcase == x[1] counter += 1 end end end end return counter end def elongated_words(str) evaluator = TokenizerTwitter.new ar = evaluator.tokenize( _html( _accent( squeezeWhitespace( _convert( str ) ) ) ) ) counter = 0 ar.each do |x| if x[1] != _cleanword(x[1]) counter += 1 end end return counter end def last_tokens_emoticons?(str) evaluator = TokenizerTwitter.new ar = evaluator.tokenize( _html( _accent( squeezeWhitespace(_downcase( _cleanword( _convert( str ) ) )) ) ) ) return +1 if ar.last[0] == "smiley" return -1 end def emoticons(str) evaluator = TokenizerTwitter.new ar = evaluator.tokenize( _html( _accent( squeezeWhitespace(_downcase( _cleanword( _convert( str ) ) )) ) ) ) counter = 0 ar.each do |x| if x[0] == "smiley" counter += 1 end end return counter end def last_tokens_url?(str) evaluator = TokenizerTwitter.new ar = evaluator.tokenize( _html( _accent( squeezeWhitespace(_downcase( _cleanword( _convert( str ) ) )) ) ) ) return +1 if ar.last[0] == "url" return -1 end def ponctuation(str) ponct = Array.new(3, 0) evaluator = TokenizerTwitter.new ar = evaluator.tokenize( _html( _accent( squeezeWhitespace(_downcase( _cleanword( _convert( str ) ) )) ) ) ) ar.each do |x| if x[1].match(/^(\?\?+)$/) ponct[0] += 1 end if x[1].match(/^(!!+)$/) ponct[1] += 1 end if x[1].include?("?!") == true ponct[2] += 1 elsif x[1].include?("!?") == true ponct[2] += 1 else #nothing end end return ponct end def semi_tokenize(str) y = Array.new evaluator = TokenizerTwitter.new ar = evaluator.tokenize( _html( _accent( squeezeWhitespace(_downcase( _cleanword( _convert( str ) ) )) ) ) ) ar.each do |x| y.push(x[1]) end return y.join(" ") end def tokenize_charliehebdo(str) hash_tags = Array.new y = Array.new evaluator = TokenizerTwitter.new ar = evaluator.tokenize( _html( _accent( squeezeWhitespace( _downcase( _cleanword( _convert( str ) ) ) ) ) ) ) ar.each do |x| if x[0] != "ponctw" if x[0] != "space" if x[0] != "sgmltag" if x[0] != "hyphen" if x[0] != "poncts" if x[0] != "apos" if x[0] != "symbol" #x[1] = "pour" if x[1] == "pr" #x[1] = "putain" if x[1] == "ptn" #x[1] = "maintenant" if x[1] == "mnt" #x[1] = "beaucoup" if x[1] == "bcp" x[1] = "isn't" if x[1] == "isnt" x[1] = "aren't" if x[1] == "arent" x[1] = "wasn't" if x[1] == "wasnt" x[1] = "weren't" if x[1] == "werent" x[1] = "haven't" if x[1] == "havent" x[1] = "hasn't" if x[1] == "hasnt" x[1] = "hadn't" if x[1] == "hadnt" x[1] = "won't" if x[1] == "wont" x[1] = "wouldn't" if x[1] == "wouldnt" x[1] = "don't" if x[1] == "dont" x[1] = "doesn't" if x[1] == "doesnt" x[1] = "didn't" if x[1] == "didnt" x[1] = "can't" if x[1] == "cant" x[1] = "couldn't" if x[1] == "couldnt" x[1] = "shouldn't" if x[1] == "shouldnt" x[1] = "mightn't" if x[1] == "mightnt" x[1] = "mustn't" if x[1] == "mustnt" if x[0] == "twitter_hashtag" hash_tags.push( x[1] ) end y.push(x[1]) end end end end end end end end return y.join(" "), hash_tags end def tokenize(str) y = Array.new evaluator = TokenizerTwitter.new ar = evaluator.tokenize( _html( _accent( squeezeWhitespace( _downcase( _cleanword( _convert( str ) ) ) ) ) ) ) ar.each do |x| if x[0] != "ponctw" if x[0] != "space" if x[0] != "sgmltag" if x[0] != "hyphen" if x[0] != "poncts" if x[0] != "apos" if x[0] != "symbol" x[1] = "isn't" if x[1] == "isnt" x[1] = "aren't" if x[1] == "arent" x[1] = "wasn't" if x[1] == "wasnt" x[1] = "weren't" if x[1] == "werent" x[1] = "haven't" if x[1] == "havent" x[1] = "hasn't" if x[1] == "hasnt" x[1] = "hadn't" if x[1] == "hadnt" x[1] = "won't" if x[1] == "wont" x[1] = "wouldn't" if x[1] == "wouldnt" x[1] = "don't" if x[1] == "dont" x[1] = "doesn't" if x[1] == "doesnt" x[1] = "didn't" if x[1] == "didnt" x[1] = "can't" if x[1] == "cant" x[1] = "couldn't" if x[1] == "couldnt" x[1] = "shouldn't" if x[1] == "shouldnt" x[1] = "mightn't" if x[1] == "mightnt" x[1] = "mustn't" if x[1] == "mustnt" if x[1] != ".." if x[1] != "..." y.push(x[1]) end end end end end end end end end end return y.join(" ") end