Commit ca96fb31f8d5fe261716907d190363d6492d29ff

Authored by romain
1 parent b55f47b385
Exists in master

exec method for Indri

Showing 3 changed files with 52 additions and 46 deletions Side-by-side Diff

lib/mirimiri/document.rb
... ... @@ -142,7 +142,7 @@
142 142 end
143 143  
144 144 def self.get_url(name)
145   - raise ArgumentError, "Bad encoding", name unless name.isutf8
  145 + raise ArgumentError, "Bad encoding", name unless name.isutf8
146 146  
147 147 atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).unaccent.toutf8).elements['api/query/pages/page'].attributes
148 148  
lib/mirimiri/query.rb
... ... @@ -72,6 +72,10 @@
72 72  
73 73 h
74 74 end
  75 +
  76 + def exec params
  77 + `IndriRunQuery -query='#{@query}' -index=#{params.index_path} -count=#{params.count} -rule=method:dirichlet,mu:2500 -trecFormat`
  78 + end
75 79 end
76 80  
77 81 class IndriQueries
lib/mirimiri/string.rb
... ... @@ -23,48 +23,50 @@
23 23  
24 24 # These are the default stopwords provided by Lemur.
25 25 Stoplist = [
26   - "a", "anything", "anyway", "anywhere", "apart", "are", "around", "as", "at", "av",
27   - "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand",
28   - "behind", "being", "below", "beside", "besides", "between", "beyond", "both", "but", "by",
29   - "can", "cannot", "canst", "certain", "cf", "choose", "contrariwise", "cos", "could", "cu",
30   - "day", "do", "does", "doesn't", "doing", "dost", "doth", "double", "down", "dual", "during",
31   - "each", "either", "else", "elsewhere", "enough", "et", "etc", "even", "ever", "every",
32   - "everybody", "everyone", "everything", "everywhere", "except", "excepted", "excepting",
33   - "exception", "exclude", "excluding", "exclusive", "far", "farther", "farthest", "few", "ff",
34   - "first", "for", "formerly", "forth", "forward", "from", "front", "further", "furthermore",
35   - "furthest", "get", "go", "had", "halves", "hardly", "has", "hast", "hath", "have", "he",
36   - "hence", "henceforth", "her", "here", "hereabouts", "hereafter", "hereby", "herein", "hereto",
37   - "hereupon", "hers", "herself", "him", "himself", "hindmost", "his", "hither", "hitherto",
38   - "how", "however", "howsoever", "i", "ie", "if", "in", "inasmuch", "inc", "include",
39   - "included", "including", "indeed", "indoors", "inside", "insomuch", "instead", "into",
40   - "inward", "inwards", "is", "it", "its", "itself", "just", "kind", "kg", "km", "last",
41   - "latter", "latterly", "less", "lest", "let", "like", "little", "ltd", "many", "may", "maybe",
42   - "me", "meantime", "meanwhile", "might", "moreover", "most", "mostly", "more", "mr", "mrs",
43   - "ms", "much", "must", "my", "myself", "namely", "need", "neither", "never", "nevertheless",
44   - "next", "no", "nobody", "none", "nonetheless", "noone", "nope", "nor", "not", "nothing",
45   - "notwithstanding", "now", "nowadays", "nowhere", "of", "off", "often", "ok", "on", "once",
46   - "one", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours",
47   - "ourselves", "out", "outside", "over", "own", "per", "perhaps", "plenty", "provide", "quite",
48   - "rather", "really", "round", "said", "sake", "same", "sang", "save", "saw", "see", "seeing",
49   - "seem", "seemed", "seeming", "seems", "seen", "seldom", "selves", "sent", "several", "shalt",
50   - "she", "should", "shown", "sideways", "since", "slept", "slew", "slung", "slunk", "smote",
51   - "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes",
52   - "somewhat", "somewhere", "spake", "spat", "spoke", "spoken", "sprang", "sprung", "stave",
53   - "staves", "still", "such", "supposing", "than", "that", "the", "thee", "their", "them",
54   - "themselves", "then", "thence", "thenceforth", "there", "thereabout", "thereabouts",
55   - "thereafter", "thereby", "therefore", "therein", "thereof", "thereon", "thereto", "thereupon",
56   - "these", "they", "this", "those", "thou", "though", "thrice", "through", "throughout", "thru",
57   - "thus", "thy", "thyself", "till", "to", "together", "too", "toward", "towards", "ugh",
58   - "unable", "under", "underneath", "unless", "unlike", "until", "up", "upon", "upward",
59   - "upwards", "us", "use", "used", "using", "very", "via", "vs", "want", "was", "we", "week",
60   - "well", "were", "what", "whatever", "whatsoever", "when", "whence", "whenever", "whensoever",
61   - "where", "whereabouts", "whereafter", "whereas", "whereat", "whereby", "wherefore",
62   - "wherefrom", "wherein", "whereinto", "whereof", "whereon", "wheresoever", "whereto",
63   - "whereunto", "whereupon", "wherever", "wherewith", "whether", "whew", "which", "whichever",
64   - "whichsoever", "while", "whilst", "whither", "who", "whoa", "whoever", "whole", "whom",
65   - "whomever", "whomsoever", "whose", "whosoever", "why", "will", "wilt", "with", "within",
66   - "without", "worse", "worst", "would", "wow", "ye", "yet", "year", "yippee", "you", "your",
67   - "yours", "yourself", "yourselves"
  26 +"a","about","above","according","across","after","afterwards","again","against",
  27 +"albeit","all","almost","alone","along","already","also","although","always","am",
  28 +"among","amongst","an","and","another","any","anybody","anyhow","anyone","anything",
  29 +"anyway","anywhere","apart","are","around","as","at","av","be","became","because",
  30 +"become","becomes","becoming","been","before","beforehand","behind","being","below",
  31 +"beside","besides","between","beyond","both","but","by","can","cannot","canst",
  32 +"certain","cf","choose","contrariwise","cos","could","cu","day","do","does","doesn't",
  33 +"doing","dost","doth","double","down","dual","during","each","either","else",
  34 +"elsewhere","enough","et","etc","even","ever","every","everybody","everyone",
  35 +"everything","everywhere","except","excepted","excepting","exception","exclude",
  36 +"excluding","exclusive","far","farther","farthest","few","ff","first","for",
  37 +"formerly","forth","forward","from","front","further","furthermore","furthest","get",
  38 +"go","had","halves","hardly","has","hast","hath","have","he","hence","henceforth",
  39 +"her","here","hereabouts","hereafter","hereby","herein","hereto","hereupon","hers",
  40 +"herself","him","himself","hindmost","his","hither","hitherto","how","however",
  41 +"howsoever","i","ie","if","in","inasmuch","inc","include","included","including",
  42 +"indeed","indoors","inside","insomuch","instead","into","inward","inwards","is",
  43 +"it","its","itself","just","kind","kg","km","last","latter","latterly","less","lest",
  44 +"let","like","little","ltd","many","may","maybe","me","meantime","meanwhile","might",
  45 +"moreover","most","mostly","more","mr","mrs","ms","much","must","my","myself",
  46 +"namely","need","neither","never","nevertheless","next","no","nobody","none",
  47 +"nonetheless","noone","nope","nor","not","nothing","notwithstanding","now","nowadays",
  48 +"nowhere","of","off","often","ok","on","once","one","only","onto","or","other",
  49 +"others","otherwise","ought","our","ours","ourselves","out","outside","over","own",
  50 +"per","perhaps","plenty","provide","quite","rather","really","round","said","sake",
  51 +"same","sang","save","saw","see","seeing","seem","seemed","seeming","seems","seen",
  52 +"seldom","selves","sent","several","shalt","she","should","shown","sideways","since",
  53 +"slept","slew","slung","slunk","smote","so","some","somebody","somehow","someone",
  54 +"something","sometime","sometimes","somewhat","somewhere","spake","spat","spoke",
  55 +"spoken","sprang","sprung","stave","staves","still","such","supposing","than","that",
  56 +"the","thee","their","them","themselves","then","thence","thenceforth","there",
  57 +"thereabout","thereabouts","thereafter","thereby","therefore","therein","thereof",
  58 +"thereon","thereto","thereupon","these","they","this","those","thou","though",
  59 +"thrice","through","throughout","thru","thus","thy","thyself","till","to","together",
  60 +"too","toward","towards","ugh","unable","under","underneath","unless","unlike",
  61 +"until","up","upon","upward","upwards","us","use","used","using","very","via","vs",
  62 +"want","was","we","week","well","were","what","whatever","whatsoever","when","whence",
  63 +"whenever","whensoever","where","whereabouts","whereafter","whereas","whereat",
  64 +"whereby","wherefore","wherefrom","wherein","whereinto","whereof","whereon",
  65 +"wheresoever","whereto","whereunto","whereupon","wherever","wherewith","whether",
  66 +"whew","which","whichever","whichsoever","while","whilst","whither","who","whoa",
  67 +"whoever","whole","whom","whomever","whomsoever","whose","whosoever","why","will",
  68 +"wilt","with","within","without","worse","worst","would","wow","ye","yet","year",
  69 +"yippee","you","your","yours","yourself","yourselves"
68 70 ]
69 71  
70 72 Transmap = {
... ... @@ -144,7 +146,7 @@
144 146 "\xCA\xBF" => "\x27", # c-single quote
145 147 "\xCC\xA8" => "", # modifier - under curve
146 148 "\xCC\xB1" => "", # modifier - under line
147   - /\W/ => ""
  149 +# /\W/ => ""
148 150 }
149 151  
150 152 end
... ... @@ -250,8 +252,8 @@
250 252  
251 253 def strip_with_pattern(pattern)
252 254 require 'cgi'
253   - require 'kconv'
254   - CGI::unescapeHTML(self.gsub(pattern,"")).toutf8
  255 +
  256 + CGI::unescapeHTML(self.gsub(pattern,"")).unaccent.encode("UTF-8", {:invalid => :replace, :undef => :replace, :replace => " "})
255 257 end
256 258  
257 259 private :strip_with_pattern