diff --git a/lib/mirimiri/document.rb b/lib/mirimiri/document.rb index 97af4ca..af57bb7 100644 --- a/lib/mirimiri/document.rb +++ b/lib/mirimiri/document.rb @@ -142,7 +142,7 @@ module Mirimiri end def self.get_url(name) - raise ArgumentError, "Bad encoding", name unless name.isutf8 + raise ArgumentError, "Bad encoding", name unless name.isutf8 atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).unaccent.toutf8).elements['api/query/pages/page'].attributes diff --git a/lib/mirimiri/query.rb b/lib/mirimiri/query.rb index 511cf43..66d0887 100644 --- a/lib/mirimiri/query.rb +++ b/lib/mirimiri/query.rb @@ -72,6 +72,10 @@ module Indri h end + + def exec params + `IndriRunQuery -query='#{@query}' -index=#{params.index_path} -count=#{params.count} -rule=method:dirichlet,mu:2500 -trecFormat` + end end class IndriQueries diff --git a/lib/mirimiri/string.rb b/lib/mirimiri/string.rb index 13eee6c..c72d215 100644 --- a/lib/mirimiri/string.rb +++ b/lib/mirimiri/string.rb @@ -23,48 +23,50 @@ module Mirimiri # These are the default stopwords provided by Lemur. 
Stoplist = [ - "a", "anything", "anyway", "anywhere", "apart", "are", "around", "as", "at", "av", - "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", - "behind", "being", "below", "beside", "besides", "between", "beyond", "both", "but", "by", - "can", "cannot", "canst", "certain", "cf", "choose", "contrariwise", "cos", "could", "cu", - "day", "do", "does", "doesn't", "doing", "dost", "doth", "double", "down", "dual", "during", - "each", "either", "else", "elsewhere", "enough", "et", "etc", "even", "ever", "every", - "everybody", "everyone", "everything", "everywhere", "except", "excepted", "excepting", - "exception", "exclude", "excluding", "exclusive", "far", "farther", "farthest", "few", "ff", - "first", "for", "formerly", "forth", "forward", "from", "front", "further", "furthermore", - "furthest", "get", "go", "had", "halves", "hardly", "has", "hast", "hath", "have", "he", - "hence", "henceforth", "her", "here", "hereabouts", "hereafter", "hereby", "herein", "hereto", - "hereupon", "hers", "herself", "him", "himself", "hindmost", "his", "hither", "hitherto", - "how", "however", "howsoever", "i", "ie", "if", "in", "inasmuch", "inc", "include", - "included", "including", "indeed", "indoors", "inside", "insomuch", "instead", "into", - "inward", "inwards", "is", "it", "its", "itself", "just", "kind", "kg", "km", "last", - "latter", "latterly", "less", "lest", "let", "like", "little", "ltd", "many", "may", "maybe", - "me", "meantime", "meanwhile", "might", "moreover", "most", "mostly", "more", "mr", "mrs", - "ms", "much", "must", "my", "myself", "namely", "need", "neither", "never", "nevertheless", - "next", "no", "nobody", "none", "nonetheless", "noone", "nope", "nor", "not", "nothing", - "notwithstanding", "now", "nowadays", "nowhere", "of", "off", "often", "ok", "on", "once", - "one", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", - "ourselves", "out", "outside", "over", "own", "per", 
"perhaps", "plenty", "provide", "quite", - "rather", "really", "round", "said", "sake", "same", "sang", "save", "saw", "see", "seeing", - "seem", "seemed", "seeming", "seems", "seen", "seldom", "selves", "sent", "several", "shalt", - "she", "should", "shown", "sideways", "since", "slept", "slew", "slung", "slunk", "smote", - "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", - "somewhat", "somewhere", "spake", "spat", "spoke", "spoken", "sprang", "sprung", "stave", - "staves", "still", "such", "supposing", "than", "that", "the", "thee", "their", "them", - "themselves", "then", "thence", "thenceforth", "there", "thereabout", "thereabouts", - "thereafter", "thereby", "therefore", "therein", "thereof", "thereon", "thereto", "thereupon", - "these", "they", "this", "those", "thou", "though", "thrice", "through", "throughout", "thru", - "thus", "thy", "thyself", "till", "to", "together", "too", "toward", "towards", "ugh", - "unable", "under", "underneath", "unless", "unlike", "until", "up", "upon", "upward", - "upwards", "us", "use", "used", "using", "very", "via", "vs", "want", "was", "we", "week", - "well", "were", "what", "whatever", "whatsoever", "when", "whence", "whenever", "whensoever", - "where", "whereabouts", "whereafter", "whereas", "whereat", "whereby", "wherefore", - "wherefrom", "wherein", "whereinto", "whereof", "whereon", "wheresoever", "whereto", - "whereunto", "whereupon", "wherever", "wherewith", "whether", "whew", "which", "whichever", - "whichsoever", "while", "whilst", "whither", "who", "whoa", "whoever", "whole", "whom", - "whomever", "whomsoever", "whose", "whosoever", "why", "will", "wilt", "with", "within", - "without", "worse", "worst", "would", "wow", "ye", "yet", "year", "yippee", "you", "your", - "yours", "yourself", "yourselves" +"a","about","above","according","across","after","afterwards","again","against", +"albeit","all","almost","alone","along","already","also","although","always","am", 
+"among","amongst","an","and","another","any","anybody","anyhow","anyone","anything", +"anyway","anywhere","apart","are","around","as","at","av","be","became","because", +"become","becomes","becoming","been","before","beforehand","behind","being","below", +"beside","besides","between","beyond","both","but","by","can","cannot","canst", +"certain","cf","choose","contrariwise","cos","could","cu","day","do","does","doesn't", +"doing","dost","doth","double","down","dual","during","each","either","else", +"elsewhere","enough","et","etc","even","ever","every","everybody","everyone", +"everything","everywhere","except","excepted","excepting","exception","exclude", +"excluding","exclusive","far","farther","farthest","few","ff","first","for", +"formerly","forth","forward","from","front","further","furthermore","furthest","get", +"go","had","halves","hardly","has","hast","hath","have","he","hence","henceforth", +"her","here","hereabouts","hereafter","hereby","herein","hereto","hereupon","hers", +"herself","him","himself","hindmost","his","hither","hitherto","how","however", +"howsoever","i","ie","if","in","inasmuch","inc","include","included","including", +"indeed","indoors","inside","insomuch","instead","into","inward","inwards","is", +"it","its","itself","just","kind","kg","km","last","latter","latterly","less","lest", +"let","like","little","ltd","many","may","maybe","me","meantime","meanwhile","might", +"moreover","most","mostly","more","mr","mrs","ms","much","must","my","myself", +"namely","need","neither","never","nevertheless","next","no","nobody","none", +"nonetheless","noone","nope","nor","not","nothing","notwithstanding","now","nowadays", +"nowhere","of","off","often","ok","on","once","one","only","onto","or","other", +"others","otherwise","ought","our","ours","ourselves","out","outside","over","own", +"per","perhaps","plenty","provide","quite","rather","really","round","said","sake", 
+"same","sang","save","saw","see","seeing","seem","seemed","seeming","seems","seen", +"seldom","selves","sent","several","shalt","she","should","shown","sideways","since", +"slept","slew","slung","slunk","smote","so","some","somebody","somehow","someone", +"something","sometime","sometimes","somewhat","somewhere","spake","spat","spoke", +"spoken","sprang","sprung","stave","staves","still","such","supposing","than","that", +"the","thee","their","them","themselves","then","thence","thenceforth","there", +"thereabout","thereabouts","thereafter","thereby","therefore","therein","thereof", +"thereon","thereto","thereupon","these","they","this","those","thou","though", +"thrice","through","throughout","thru","thus","thy","thyself","till","to","together", +"too","toward","towards","ugh","unable","under","underneath","unless","unlike", +"until","up","upon","upward","upwards","us","use","used","using","very","via","vs", +"want","was","we","week","well","were","what","whatever","whatsoever","when","whence", +"whenever","whensoever","where","whereabouts","whereafter","whereas","whereat", +"whereby","wherefore","wherefrom","wherein","whereinto","whereof","whereon", +"wheresoever","whereto","whereunto","whereupon","wherever","wherewith","whether", +"whew","which","whichever","whichsoever","while","whilst","whither","who","whoa", +"whoever","whole","whom","whomever","whomsoever","whose","whosoever","why","will", +"wilt","with","within","without","worse","worst","would","wow","ye","yet","year", +"yippee","you","your","yours","yourself","yourselves" ] Transmap = { @@ -144,7 +146,7 @@ module Mirimiri "\xCA\xBF" => "\x27", # c-single quote "\xCC\xA8" => "", # modifier - under curve "\xCC\xB1" => "", # modifier - under line - /\W/ => "" +# /\W/ => "" } end @@ -250,8 +252,8 @@ class String def strip_with_pattern(pattern) require 'cgi' - require 'kconv' - CGI::unescapeHTML(self.gsub(pattern,"")).toutf8 + + CGI::unescapeHTML(self.gsub(pattern,"")).unaccent.encode("UTF-8", {:invalid => 
:replace, :undef => :replace, :replace => " "}) end private :strip_with_pattern