Commit ca96fb31f8d5fe261716907d190363d6492d29ff
1 parent
b55f47b385
Exists in
master
exec method for Indri
Showing 3 changed files with 52 additions and 46 deletions Side-by-side Diff
lib/mirimiri/document.rb
... | ... | @@ -142,7 +142,7 @@ |
142 | 142 | end |
143 | 143 | |
144 | 144 | def self.get_url(name) |
145 | - raise ArgumentError, "Bad encoding", name unless name.isutf8 | |
145 | + raise ArgumentError, "Bad encoding", name unless name.isutf8 | |
146 | 146 | |
147 | 147 | atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).unaccent.toutf8).elements['api/query/pages/page'].attributes |
148 | 148 |
lib/mirimiri/query.rb
lib/mirimiri/string.rb
... | ... | @@ -23,48 +23,50 @@ |
23 | 23 | |
24 | 24 | # These are the default stopwords provided by Lemur. |
25 | 25 | Stoplist = [ |
26 | - "a", "anything", "anyway", "anywhere", "apart", "are", "around", "as", "at", "av", | |
27 | - "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", | |
28 | - "behind", "being", "below", "beside", "besides", "between", "beyond", "both", "but", "by", | |
29 | - "can", "cannot", "canst", "certain", "cf", "choose", "contrariwise", "cos", "could", "cu", | |
30 | - "day", "do", "does", "doesn't", "doing", "dost", "doth", "double", "down", "dual", "during", | |
31 | - "each", "either", "else", "elsewhere", "enough", "et", "etc", "even", "ever", "every", | |
32 | - "everybody", "everyone", "everything", "everywhere", "except", "excepted", "excepting", | |
33 | - "exception", "exclude", "excluding", "exclusive", "far", "farther", "farthest", "few", "ff", | |
34 | - "first", "for", "formerly", "forth", "forward", "from", "front", "further", "furthermore", | |
35 | - "furthest", "get", "go", "had", "halves", "hardly", "has", "hast", "hath", "have", "he", | |
36 | - "hence", "henceforth", "her", "here", "hereabouts", "hereafter", "hereby", "herein", "hereto", | |
37 | - "hereupon", "hers", "herself", "him", "himself", "hindmost", "his", "hither", "hitherto", | |
38 | - "how", "however", "howsoever", "i", "ie", "if", "in", "inasmuch", "inc", "include", | |
39 | - "included", "including", "indeed", "indoors", "inside", "insomuch", "instead", "into", | |
40 | - "inward", "inwards", "is", "it", "its", "itself", "just", "kind", "kg", "km", "last", | |
41 | - "latter", "latterly", "less", "lest", "let", "like", "little", "ltd", "many", "may", "maybe", | |
42 | - "me", "meantime", "meanwhile", "might", "moreover", "most", "mostly", "more", "mr", "mrs", | |
43 | - "ms", "much", "must", "my", "myself", "namely", "need", "neither", "never", "nevertheless", | |
44 | - "next", "no", "nobody", "none", "nonetheless", "noone", "nope", "nor", "not", "nothing", | |
45 | - "notwithstanding", "now", "nowadays", "nowhere", "of", "off", "often", "ok", "on", "once", | |
46 | - "one", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", | |
47 | - "ourselves", "out", "outside", "over", "own", "per", "perhaps", "plenty", "provide", "quite", | |
48 | - "rather", "really", "round", "said", "sake", "same", "sang", "save", "saw", "see", "seeing", | |
49 | - "seem", "seemed", "seeming", "seems", "seen", "seldom", "selves", "sent", "several", "shalt", | |
50 | - "she", "should", "shown", "sideways", "since", "slept", "slew", "slung", "slunk", "smote", | |
51 | - "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", | |
52 | - "somewhat", "somewhere", "spake", "spat", "spoke", "spoken", "sprang", "sprung", "stave", | |
53 | - "staves", "still", "such", "supposing", "than", "that", "the", "thee", "their", "them", | |
54 | - "themselves", "then", "thence", "thenceforth", "there", "thereabout", "thereabouts", | |
55 | - "thereafter", "thereby", "therefore", "therein", "thereof", "thereon", "thereto", "thereupon", | |
56 | - "these", "they", "this", "those", "thou", "though", "thrice", "through", "throughout", "thru", | |
57 | - "thus", "thy", "thyself", "till", "to", "together", "too", "toward", "towards", "ugh", | |
58 | - "unable", "under", "underneath", "unless", "unlike", "until", "up", "upon", "upward", | |
59 | - "upwards", "us", "use", "used", "using", "very", "via", "vs", "want", "was", "we", "week", | |
60 | - "well", "were", "what", "whatever", "whatsoever", "when", "whence", "whenever", "whensoever", | |
61 | - "where", "whereabouts", "whereafter", "whereas", "whereat", "whereby", "wherefore", | |
62 | - "wherefrom", "wherein", "whereinto", "whereof", "whereon", "wheresoever", "whereto", | |
63 | - "whereunto", "whereupon", "wherever", "wherewith", "whether", "whew", "which", "whichever", | |
64 | - "whichsoever", "while", "whilst", "whither", "who", "whoa", "whoever", "whole", "whom", | |
65 | - "whomever", "whomsoever", "whose", "whosoever", "why", "will", "wilt", "with", "within", | |
66 | - "without", "worse", "worst", "would", "wow", "ye", "yet", "year", "yippee", "you", "your", | |
67 | - "yours", "yourself", "yourselves" | |
26 | +"a","about","above","according","across","after","afterwards","again","against", | |
27 | +"albeit","all","almost","alone","along","already","also","although","always","am", | |
28 | +"among","amongst","an","and","another","any","anybody","anyhow","anyone","anything", | |
29 | +"anyway","anywhere","apart","are","around","as","at","av","be","became","because", | |
30 | +"become","becomes","becoming","been","before","beforehand","behind","being","below", | |
31 | +"beside","besides","between","beyond","both","but","by","can","cannot","canst", | |
32 | +"certain","cf","choose","contrariwise","cos","could","cu","day","do","does","doesn't", | |
33 | +"doing","dost","doth","double","down","dual","during","each","either","else", | |
34 | +"elsewhere","enough","et","etc","even","ever","every","everybody","everyone", | |
35 | +"everything","everywhere","except","excepted","excepting","exception","exclude", | |
36 | +"excluding","exclusive","far","farther","farthest","few","ff","first","for", | |
37 | +"formerly","forth","forward","from","front","further","furthermore","furthest","get", | |
38 | +"go","had","halves","hardly","has","hast","hath","have","he","hence","henceforth", | |
39 | +"her","here","hereabouts","hereafter","hereby","herein","hereto","hereupon","hers", | |
40 | +"herself","him","himself","hindmost","his","hither","hitherto","how","however", | |
41 | +"howsoever","i","ie","if","in","inasmuch","inc","include","included","including", | |
42 | +"indeed","indoors","inside","insomuch","instead","into","inward","inwards","is", | |
43 | +"it","its","itself","just","kind","kg","km","last","latter","latterly","less","lest", | |
44 | +"let","like","little","ltd","many","may","maybe","me","meantime","meanwhile","might", | |
45 | +"moreover","most","mostly","more","mr","mrs","ms","much","must","my","myself", | |
46 | +"namely","need","neither","never","nevertheless","next","no","nobody","none", | |
47 | +"nonetheless","noone","nope","nor","not","nothing","notwithstanding","now","nowadays", | |
48 | +"nowhere","of","off","often","ok","on","once","one","only","onto","or","other", | |
49 | +"others","otherwise","ought","our","ours","ourselves","out","outside","over","own", | |
50 | +"per","perhaps","plenty","provide","quite","rather","really","round","said","sake", | |
51 | +"same","sang","save","saw","see","seeing","seem","seemed","seeming","seems","seen", | |
52 | +"seldom","selves","sent","several","shalt","she","should","shown","sideways","since", | |
53 | +"slept","slew","slung","slunk","smote","so","some","somebody","somehow","someone", | |
54 | +"something","sometime","sometimes","somewhat","somewhere","spake","spat","spoke", | |
55 | +"spoken","sprang","sprung","stave","staves","still","such","supposing","than","that", | |
56 | +"the","thee","their","them","themselves","then","thence","thenceforth","there", | |
57 | +"thereabout","thereabouts","thereafter","thereby","therefore","therein","thereof", | |
58 | +"thereon","thereto","thereupon","these","they","this","those","thou","though", | |
59 | +"thrice","through","throughout","thru","thus","thy","thyself","till","to","together", | |
60 | +"too","toward","towards","ugh","unable","under","underneath","unless","unlike", | |
61 | +"until","up","upon","upward","upwards","us","use","used","using","very","via","vs", | |
62 | +"want","was","we","week","well","were","what","whatever","whatsoever","when","whence", | |
63 | +"whenever","whensoever","where","whereabouts","whereafter","whereas","whereat", | |
64 | +"whereby","wherefore","wherefrom","wherein","whereinto","whereof","whereon", | |
65 | +"wheresoever","whereto","whereunto","whereupon","wherever","wherewith","whether", | |
66 | +"whew","which","whichever","whichsoever","while","whilst","whither","who","whoa", | |
67 | +"whoever","whole","whom","whomever","whomsoever","whose","whosoever","why","will", | |
68 | +"wilt","with","within","without","worse","worst","would","wow","ye","yet","year", | |
69 | +"yippee","you","your","yours","yourself","yourselves" | |
68 | 70 | ] |
69 | 71 | |
70 | 72 | Transmap = { |
... | ... | @@ -144,7 +146,7 @@ |
144 | 146 | "\xCA\xBF" => "\x27", # c-single quote |
145 | 147 | "\xCC\xA8" => "", # modifier - under curve |
146 | 148 | "\xCC\xB1" => "", # modifier - under line |
147 | - /\W/ => "" | |
149 | +# /\W/ => "" | |
148 | 150 | } |
149 | 151 | |
150 | 152 | end |
... | ... | @@ -250,8 +252,8 @@ |
250 | 252 | |
251 | 253 | def strip_with_pattern(pattern) |
252 | 254 | require 'cgi' |
253 | - require 'kconv' | |
254 | - CGI::unescapeHTML(self.gsub(pattern,"")).toutf8 | |
255 | + | |
256 | + CGI::unescapeHTML(self.gsub(pattern,"")).unaccent.encode("UTF-8", {:invalid => :replace, :undef => :replace, :replace => " "}) | |
255 | 257 | end |
256 | 258 | |
257 | 259 | private :strip_with_pattern |