Commit b3c02139759e76987368a3d947c49f8c6641ce21

Authored by Romain Deveaud
1 parent e0e33fca06
Exists in master

Faster computation of successive calls to ngrams(). is_stopword? is now effective for multiword expressions. SDM fix.

Showing 3 changed files with 13 additions and 8 deletions (inline diff).

lib/mirimiri/document.rb
1 #!/usr/bin/env ruby 1 #!/usr/bin/env ruby
2 2
3 #-- 3 #--
4 # This file is a part of the mirimiri library 4 # This file is a part of the mirimiri library
5 # 5 #
6 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> 6 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
7 # 7 #
8 # This program is free software: you can redistribute it and/or modify 8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by 9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or 10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version. 11 # (at your option) any later version.
12 # 12 #
13 # This program is distributed in the hope that it will be useful, 13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details. 16 # GNU General Public License for more details.
17 # 17 #
18 # You should have received a copy of the GNU General Public License 18 # You should have received a copy of the GNU General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>. 19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 #++ 20 #++
21 21
22 22
23 # General module 23 # General module
24 module Mirimiri 24 module Mirimiri
25 25
26 # A Document is a bag of words and is constructed from a string. 26 # A Document is a bag of words and is constructed from a string.
27 class Document 27 class Document
28 attr_reader :words, :doc_content, :count_words 28 attr_reader :words, :doc_content, :count_words
29 29
30 # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html 30 # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html
31 # and the \\W special escape). 31 # and the \\W special escape).
32 # 32 #
33 # Protected function, only meant to be called at initialization. 33 # Protected function, only meant to be called at initialization.
34 def format_words 34 def format_words
35 wo = [] 35 wo = []
36 36
37 @doc_content.split.each do |w| 37 @doc_content.split.each do |w|
38 w.split(/\W/).each do |sw| 38 w.split(/\W/).each do |sw|
39 wo.push(sw.downcase) if sw =~ /[a-zA-Z]/ 39 wo.push(sw.downcase) if sw =~ /[a-zA-Z]/
40 end 40 end
41 end 41 end
42 42
43 wo 43 wo
44 end 44 end
45 45
46 # Returns an Array containing the +n+-grams (words) from the current Document. 46 # Returns an Array containing the +n+-grams (words) from the current Document.
47 # 47 #
48 # ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...] 48 # ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]
49 def ngrams(n) 49 def ngrams(n)
50 window = [] 50 window = []
51 ngrams_array = [] 51 ngrams_array = []
52 52
53 @words.each do |w| 53 if @ngrams[n].nil?
54 window.push(w) 54 @words.each do |w|
55 if window.size == n 55 window.push(w)
56 ngrams_array.push window.join(" ") 56 if window.size == n
57 window.delete_at(0) 57 ngrams_array.push window.join(" ")
58 window.delete_at(0)
59 end
58 end 60 end
61 @ngrams[n] = ngrams_array
59 end 62 end
60 63
61 ngrams_array 64 @ngrams[n]
62 end 65 end
63 66
64 # Returns a Hash containing the words and their associated counts in the current Document. 67 # Returns a Hash containing the words and their associated counts in the current Document.
65 # 68 #
66 # count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... } 69 # count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }
67 def count_words 70 def count_words
68 counts = Hash.new { |h,k| h[k] = 0 } 71 counts = Hash.new { |h,k| h[k] = 0 }
69 @words.each { |w| counts[w] += 1 } 72 @words.each { |w| counts[w] += 1 }
70 73
71 counts 74 counts
72 end 75 end
73 76
74 # Old entropy function. 77 # Old entropy function.
75 # TODO: remove. 78 # TODO: remove.
76 def entropy0(s) 79 def entropy0(s)
77 en = 0.0 80 en = 0.0
78 81
79 s.split.each do |w| 82 s.split.each do |w|
80 p_wi = @count_words[w].to_f/@words.count.to_f 83 p_wi = @count_words[w].to_f/@words.count.to_f
81 en += p_wi*Math.log2(p_wi) 84 en += p_wi*Math.log2(p_wi)
82 end 85 end
83 86
84 en *= -1 87 en *= -1
85 en 88 en
86 end 89 end
87 90
88 # Computes the entropy of a given string +s+ inside the document. 91 # Computes the entropy of a given string +s+ inside the document.
89 # 92 #
90 # If the string parameter is composed of many words (i.e. tokens separated 93 # If the string parameter is composed of many words (i.e. tokens separated
91 # by whitespace(s)), it is considered as an ngram. 94 # by whitespace(s)), it is considered as an ngram.
92 # 95 #
93 # entropy("guitar") #=> 0.014348983965324762 96 # entropy("guitar") #=> 0.014348983965324762
94 # entropy("dillinger escape plan") #=> 0.054976093116768154 97 # entropy("dillinger escape plan") #=> 0.054976093116768154
95 def entropy(s) 98 def entropy(s)
96 en = 0.0 99 en = 0.0
97 100
98 size = s.split.size 101 size = s.split.size
99 102
100 if size == 1 103 if size == 1
101 p_wi = @count_words[s].to_f/@words.count.to_f 104 p_wi = @count_words[s].to_f/@words.count.to_f
102 en += p_wi*Math.log(p_wi) 105 en += p_wi*Math.log(p_wi)
103 elsif size > 1 106 elsif size > 1
104 ng_size = ngrams(size) 107 ng_size = ngrams(size)
105 p_wi = ng_size.count(s).to_f/ng_size.count.to_f 108 p_wi = ng_size.count(s).to_f/ng_size.count.to_f
106 en += p_wi*Math.log(p_wi) 109 en += p_wi*Math.log(p_wi)
107 end 110 end
108 111
109 en *= -1 112 en *= -1
110 en 113 en
111 end 114 end
112 115
113 # Computes the term frequency of a given *word* +s+. 116 # Computes the term frequency of a given *word* +s+.
114 # 117 #
115 # tf("guitar") #=> 0.000380372765310004 118 # tf("guitar") #=> 0.000380372765310004
116 def tf(s) 119 def tf(s)
117 @count_words[s].to_f/@words.size.to_f 120 @count_words[s].to_f/@words.size.to_f
118 end 121 end
119 122
120 123
121 def initialize(content="") 124 def initialize(content="")
122 @doc_content = content 125 @doc_content = content
123 @words = format_words 126 @words = format_words
124 @count_words = count_words 127 @count_words = count_words
128 @ngrams = {}
125 end 129 end
126 130
127 protected :format_words, :count_words 131 protected :format_words, :count_words
128 end 132 end
129 133
130 # A WebDocument is a Document with a +url+. 134 # A WebDocument is a Document with a +url+.
131 class WebDocument < Document 135 class WebDocument < Document
132 attr_reader :url 136 attr_reader :url
133 137
134 # Returns the HTML text from the page of a given +url+. 138 # Returns the HTML text from the page of a given +url+.
135 def self.get_content(url) 139 def self.get_content(url)
136 require 'net/http' 140 require 'net/http'
137 Net::HTTP.get(URI.parse(url)) 141 Net::HTTP.get(URI.parse(url))
138 end 142 end
139 143
140 144
141 # WebDocument constructor, the content of the Document is the HTML page 145 # WebDocument constructor, the content of the Document is the HTML page
142 # without the tags. 146 # without the tags.
143 def initialize(url,only_tags=nil) 147 def initialize(url,only_tags=nil)
144 require 'sanitize' 148 require 'sanitize'
145 149
146 @url = url 150 @url = url
147 content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("") 151 content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("")
148 super Sanitize.clean(content.unaccent.toutf8.force_encoding("UTF-8"), :remove_contents => ['script']) 152 super Sanitize.clean(content.unaccent.toutf8.force_encoding("UTF-8"), :remove_contents => ['script'])
149 end 153 end
150 end 154 end
151 155
152 # A WikipediaPage is a WebDocument. 156 # A WikipediaPage is a WebDocument.
153 class WikipediaPage < WebDocument 157 class WikipediaPage < WebDocument
154 require 'rexml/document' 158 require 'rexml/document'
155 require 'net/http' 159 require 'net/http'
156 require 'kconv' 160 require 'kconv'
157 161
158 162
159 def self.search_wikipedia_titles(name) 163 def self.search_wikipedia_titles(name)
160 raise ArgumentError, "Bad encoding", name unless name.isutf8 164 raise ArgumentError, "Bad encoding", name unless name.isutf8
161 165
162 res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).unaccent.toutf8).elements['api/query/search'] 166 res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).unaccent.toutf8).elements['api/query/search']
163 167
164 res.collect { |e| e.attributes['title'] } unless res.nil? 168 res.collect { |e| e.attributes['title'] } unless res.nil?
165 end 169 end
166 170
167 def self.get_url(name) 171 def self.get_url(name)
168 raise ArgumentError, "Bad encoding", name unless name.isutf8 172 raise ArgumentError, "Bad encoding", name unless name.isutf8
169 173
170 atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).unaccent.toutf8).elements['api/query/pages/page'].attributes 174 atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).unaccent.toutf8).elements['api/query/pages/page'].attributes
171 175
172 atts['fullurl'] if atts['missing'].nil? 176 atts['fullurl'] if atts['missing'].nil?
173 end 177 end
174 178
175 def self.search_homepage(name) 179 def self.search_homepage(name)
176 title = WikipediaPage.search_wikipedia_titles name 180 title = WikipediaPage.search_wikipedia_titles name
177 181
178 WikipediaPage.get_url(title[0]) unless title.nil? || title.empty? 182 WikipediaPage.get_url(title[0]) unless title.nil? || title.empty?
179 end 183 end
180 184
181 def self.extract_anchors(url) 185 def self.extract_anchors(url)
182 self.get_content(url).extract_xmltags_values('p').join(' ').scan(/<a href="(.+?)" title=.*?>(.+?)<\/a>/).delete_if { |a| a[0] =~ /^\/wiki\/.*$/.negated } 186 self.get_content(url).extract_xmltags_values('p').join(' ').scan(/<a href="(.+?)" title=.*?>(.+?)<\/a>/).delete_if { |a| a[0] =~ /^\/wiki\/.*$/.negated }
183 end 187 end
184 end 188 end
185 189
186 class FreebasePage < WebDocument 190 class FreebasePage < WebDocument
187 require 'net/http' 191 require 'net/http'
188 require 'kconv' 192 require 'kconv'
189 require 'json' 193 require 'json'
190 194
191 def self.search_article_ids query,limit 195 def self.search_article_ids query,limit
192 raise ArgumentError, "Bad encoding", name unless name.isutf8 196 raise ArgumentError, "Bad encoding", name unless name.isutf8
193 197
194 JSON.parse(Net::HTTP.get( URI.parse "http://api.freebase.com/api/service/search?query=#{query.gsub(" ","+")}&limit=#{limit}" ))['result'].collect { |a| a['article']['id'] unless a['article'].nil? }.compact 198 JSON.parse(Net::HTTP.get( URI.parse "http://api.freebase.com/api/service/search?query=#{query.gsub(" ","+")}&limit=#{limit}" ))['result'].collect { |a| a['article']['id'] unless a['article'].nil? }.compact
195 end 199 end
196 200
197 def self.get_url id 201 def self.get_url id
198 "http://api.freebase.com/api/trans/raw#{id}" 202 "http://api.freebase.com/api/trans/raw#{id}"
199 end 203 end
200 end 204 end
201 end 205 end
202 206
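The hunk above implements the "faster computation of successive calls to ngrams()" part of the commit: Document now keeps an @ngrams hash (initialized in the constructor), so each n-gram size is built only once and then served from the cache. A minimal sketch of the resulting behaviour, not part of the commit; the sample sentence is made up for illustration:

require 'mirimiri'

doc = Mirimiri::Document.new "the dillinger escape plan is a band"

first  = doc.ngrams(2)   # builds the bigram array and stores it in @ngrams[2]
second = doc.ngrams(2)   # returns the cached array, nothing is recomputed

p first.first            #=> "the dillinger"
p first.equal?(second)   #=> true, the very same Array object comes back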
lib/mirimiri/string.rb
1 #!/usr/bin/env ruby 1 #!/usr/bin/env ruby
2 2
3 #-- 3 #--
4 # This file is a part of the mirimiri library 4 # This file is a part of the mirimiri library
5 # 5 #
6 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> 6 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
7 # 7 #
8 # This program is free software: you can redistribute it and/or modify 8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by 9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or 10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version. 11 # (at your option) any later version.
12 # 12 #
13 # This program is distributed in the hope that it will be useful, 13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details. 16 # GNU General Public License for more details.
17 # 17 #
18 # You should have received a copy of the GNU General Public License 18 # You should have received a copy of the GNU General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>. 19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 #++ 20 #++
21 21
22 module Mirimiri 22 module Mirimiri
23 23
24 # These are the default stopwords provided by Lemur. 24 # These are the default stopwords provided by Lemur.
25 Stoplist = [ 25 Stoplist = [
26 "a","about","above","according","across","after","afterwards","again","against", 26 "a","about","above","according","across","after","afterwards","again","against",
27 "albeit","all","almost","alone","along","already","also","although","always","am", 27 "albeit","all","almost","alone","along","already","also","although","always","am",
28 "among","amongst","an","and","another","any","anybody","anyhow","anyone","anything", 28 "among","amongst","an","and","another","any","anybody","anyhow","anyone","anything",
29 "anyway","anywhere","apart","are","around","as","at","av","be","became","because", 29 "anyway","anywhere","apart","are","around","as","at","av","be","became","because",
30 "become","becomes","becoming","been","before","beforehand","behind","being","below", 30 "become","becomes","becoming","been","before","beforehand","behind","being","below",
31 "beside","besides","between","beyond","both","but","by","can","cannot","canst", 31 "beside","besides","between","beyond","both","but","by","can","cannot","canst",
32 "certain","cf","choose","contrariwise","cos","could","cu","day","do","does","doesn't", 32 "certain","cf","choose","contrariwise","cos","could","cu","day","do","does","doesn't",
33 "doing","dost","doth","double","down","dual","during","each","either","else", 33 "doing","dost","doth","double","down","dual","during","each","either","else",
34 "elsewhere","enough","et","etc","even","ever","every","everybody","everyone", 34 "elsewhere","enough","et","etc","even","ever","every","everybody","everyone",
35 "everything","everywhere","except","excepted","excepting","exception","exclude", 35 "everything","everywhere","except","excepted","excepting","exception","exclude",
36 "excluding","exclusive","far","farther","farthest","few","ff","first","for", 36 "excluding","exclusive","far","farther","farthest","few","ff","first","for",
37 "formerly","forth","forward","from","front","further","furthermore","furthest","get", 37 "formerly","forth","forward","from","front","further","furthermore","furthest","get",
38 "go","had","halves","hardly","has","hast","hath","have","he","hence","henceforth", 38 "go","had","halves","hardly","has","hast","hath","have","he","hence","henceforth",
39 "her","here","hereabouts","hereafter","hereby","herein","hereto","hereupon","hers", 39 "her","here","hereabouts","hereafter","hereby","herein","hereto","hereupon","hers",
40 "herself","him","himself","hindmost","his","hither","hitherto","how","however", 40 "herself","him","himself","hindmost","his","hither","hitherto","how","however",
41 "howsoever","i","ie","if","in","inasmuch","inc","include","included","including", 41 "howsoever","i","ie","if","in","inasmuch","inc","include","included","including",
42 "indeed","indoors","inside","insomuch","instead","into","inward","inwards","is", 42 "indeed","indoors","inside","insomuch","instead","into","inward","inwards","is",
43 "it","its","itself","just","kind","kg","km","last","latter","latterly","less","lest", 43 "it","its","itself","just","kind","kg","km","last","latter","latterly","less","lest",
44 "let","like","little","ltd","many","may","maybe","me","meantime","meanwhile","might", 44 "let","like","little","ltd","many","may","maybe","me","meantime","meanwhile","might",
45 "moreover","most","mostly","more","mr","mrs","ms","much","must","my","myself", 45 "moreover","most","mostly","more","mr","mrs","ms","much","must","my","myself",
46 "namely","need","neither","never","nevertheless","next","no","nobody","none", 46 "namely","need","neither","never","nevertheless","next","no","nobody","none",
47 "nonetheless","noone","nope","nor","not","nothing","notwithstanding","now","nowadays", 47 "nonetheless","noone","nope","nor","not","nothing","notwithstanding","now","nowadays",
48 "nowhere","of","off","often","ok","on","once","one","only","onto","or","other", 48 "nowhere","of","off","often","ok","on","once","one","only","onto","or","other",
49 "others","otherwise","ought","our","ours","ourselves","out","outside","over","own", 49 "others","otherwise","ought","our","ours","ourselves","out","outside","over","own",
50 "per","perhaps","plenty","provide","quite","rather","really","round","said","sake", 50 "per","perhaps","plenty","provide","quite","rather","really","round","said","sake",
51 "same","sang","save","saw","see","seeing","seem","seemed","seeming","seems","seen", 51 "same","sang","save","saw","see","seeing","seem","seemed","seeming","seems","seen",
52 "seldom","selves","sent","several","shalt","she","should","shown","sideways","since", 52 "seldom","selves","sent","several","shalt","she","should","shown","sideways","since",
53 "slept","slew","slung","slunk","smote","so","some","somebody","somehow","someone", 53 "slept","slew","slung","slunk","smote","so","some","somebody","somehow","someone",
54 "something","sometime","sometimes","somewhat","somewhere","spake","spat","spoke", 54 "something","sometime","sometimes","somewhat","somewhere","spake","spat","spoke",
55 "spoken","sprang","sprung","stave","staves","still","such","supposing","than","that", 55 "spoken","sprang","sprung","stave","staves","still","such","supposing","than","that",
56 "the","thee","their","them","themselves","then","thence","thenceforth","there", 56 "the","thee","their","them","themselves","then","thence","thenceforth","there",
57 "thereabout","thereabouts","thereafter","thereby","therefore","therein","thereof", 57 "thereabout","thereabouts","thereafter","thereby","therefore","therein","thereof",
58 "thereon","thereto","thereupon","these","they","this","those","thou","though", 58 "thereon","thereto","thereupon","these","they","this","those","thou","though",
59 "thrice","through","throughout","thru","thus","thy","thyself","till","to","together", 59 "thrice","through","throughout","thru","thus","thy","thyself","till","to","together",
60 "too","toward","towards","ugh","unable","under","underneath","unless","unlike", 60 "too","toward","towards","ugh","unable","under","underneath","unless","unlike",
61 "until","up","upon","upward","upwards","us","use","used","using","very","via","vs", 61 "until","up","upon","upward","upwards","us","use","used","using","very","via","vs",
62 "want","was","we","week","well","were","what","whatever","whatsoever","when","whence", 62 "want","was","we","week","well","were","what","whatever","whatsoever","when","whence",
63 "whenever","whensoever","where","whereabouts","whereafter","whereas","whereat", 63 "whenever","whensoever","where","whereabouts","whereafter","whereas","whereat",
64 "whereby","wherefore","wherefrom","wherein","whereinto","whereof","whereon", 64 "whereby","wherefore","wherefrom","wherein","whereinto","whereof","whereon",
65 "wheresoever","whereto","whereunto","whereupon","wherever","wherewith","whether", 65 "wheresoever","whereto","whereunto","whereupon","wherever","wherewith","whether",
66 "whew","which","whichever","whichsoever","while","whilst","whither","who","whoa", 66 "whew","which","whichever","whichsoever","while","whilst","whither","who","whoa",
67 "whoever","whole","whom","whomever","whomsoever","whose","whosoever","why","will", 67 "whoever","whole","whom","whomever","whomsoever","whose","whosoever","why","will",
68 "wilt","with","within","without","worse","worst","would","wow","ye","yet","year", 68 "wilt","with","within","without","worse","worst","would","wow","ye","yet","year",
69 "yippee","you","your","yours","yourself","yourselves", 69 "yippee","you","your","yours","yourself","yourselves",
70 "edit", "new", "page", "article", "http", "www", "com", "org", "wikipedia", "en","html" 70 "edit", "new", "page", "article", "http", "www", "com", "org", "wikipedia", "en","html"
71 ] 71 ]
72 72
73 Transmap = { 73 Transmap = {
74 "\xC3\x80" => "A", "\xC3\x81" => "A", "\xC3\x82" => "A", "\xC3\x83" => "A", 74 "\xC3\x80" => "A", "\xC3\x81" => "A", "\xC3\x82" => "A", "\xC3\x83" => "A",
75 "\xC3\x84" => "A", "\xC3\x85" => "A", "\xC3\x86" => "AE","\xC3\x87" => "C", 75 "\xC3\x84" => "A", "\xC3\x85" => "A", "\xC3\x86" => "AE","\xC3\x87" => "C",
76 "\xC3\x88" => "E", "\xC3\x89" => "E", "\xC3\x8A" => "E", "\xC3\x8B" => "E", 76 "\xC3\x88" => "E", "\xC3\x89" => "E", "\xC3\x8A" => "E", "\xC3\x8B" => "E",
77 "\xC3\x8C" => "I", "\xC3\x8D" => "I", "\xC3\x8E" => "I", "\xC3\x8F" => "I", 77 "\xC3\x8C" => "I", "\xC3\x8D" => "I", "\xC3\x8E" => "I", "\xC3\x8F" => "I",
78 "\xC3\x90" => "D", "\xC3\x91" => "N", "\xC3\x92" => "O", "\xC3\x93" => "O", 78 "\xC3\x90" => "D", "\xC3\x91" => "N", "\xC3\x92" => "O", "\xC3\x93" => "O",
79 "\xC3\x94" => "O", "\xC3\x95" => "O", "\xC3\x96" => "O", "\xC3\x98" => "O", 79 "\xC3\x94" => "O", "\xC3\x95" => "O", "\xC3\x96" => "O", "\xC3\x98" => "O",
80 "\xC3\x99" => "U", "\xC3\x9A" => "U", "\xC3\x9B" => "U", "\xC3\x9C" => "U", 80 "\xC3\x99" => "U", "\xC3\x9A" => "U", "\xC3\x9B" => "U", "\xC3\x9C" => "U",
81 "\xC3\x9D" => "Y", "\xC3\x9E" => "P", "\xC3\x9F" => "ss", 81 "\xC3\x9D" => "Y", "\xC3\x9E" => "P", "\xC3\x9F" => "ss",
82 "\xC3\xA0" => "a", "\xC3\xA1" => "a", "\xC3\xA2" => "a", "\xC3\xA3" => "a", 82 "\xC3\xA0" => "a", "\xC3\xA1" => "a", "\xC3\xA2" => "a", "\xC3\xA3" => "a",
83 "\xC3\xA4" => "a", "\xC3\xA5" => "a", "\xC3\xA6" => "ae","\xC3\xA7" => "c", 83 "\xC3\xA4" => "a", "\xC3\xA5" => "a", "\xC3\xA6" => "ae","\xC3\xA7" => "c",
84 "\xC3\xA8" => "e", "\xC3\xA9" => "e", "\xC3\xAA" => "e", "\xC3\xAB" => "e", 84 "\xC3\xA8" => "e", "\xC3\xA9" => "e", "\xC3\xAA" => "e", "\xC3\xAB" => "e",
85 "\xC3\xAC" => "i", "\xC3\xAD" => "i", "\xC3\xAE" => "i", "\xC3\xAF" => "i", 85 "\xC3\xAC" => "i", "\xC3\xAD" => "i", "\xC3\xAE" => "i", "\xC3\xAF" => "i",
86 "\xC3\xB0" => "o", "\xC3\xB1" => "n", "\xC3\xB2" => "o", "\xC3\xB3" => "o", 86 "\xC3\xB0" => "o", "\xC3\xB1" => "n", "\xC3\xB2" => "o", "\xC3\xB3" => "o",
87 "\xC3\xB4" => "o", "\xC3\xB5" => "o", "\xC3\xB6" => "o", "\xC3\xB8" => "o", 87 "\xC3\xB4" => "o", "\xC3\xB5" => "o", "\xC3\xB6" => "o", "\xC3\xB8" => "o",
88 "\xC3\xB9" => "u", "\xC3\xBA" => "u", "\xC3\xBB" => "u", "\xC3\xBC" => "u", 88 "\xC3\xB9" => "u", "\xC3\xBA" => "u", "\xC3\xBB" => "u", "\xC3\xBC" => "u",
89 "\xC3\xBD" => "y", "\xC3\xBE" => "p", "\xC3\xBF" => "y", 89 "\xC3\xBD" => "y", "\xC3\xBE" => "p", "\xC3\xBF" => "y",
90 "\xC4\x80" => "A", "\xC4\x81" => "a", "\xC4\x82" => "A", "\xC4\x83" => "a", 90 "\xC4\x80" => "A", "\xC4\x81" => "a", "\xC4\x82" => "A", "\xC4\x83" => "a",
91 "\xC4\x84" => "A", "\xC4\x85" => "a", "\xC4\x86" => "C", "\xC4\x87" => "c", 91 "\xC4\x84" => "A", "\xC4\x85" => "a", "\xC4\x86" => "C", "\xC4\x87" => "c",
92 "\xC4\x88" => "C", "\xC4\x89" => "c", "\xC4\x8A" => "C", "\xC4\x8B" => "c", 92 "\xC4\x88" => "C", "\xC4\x89" => "c", "\xC4\x8A" => "C", "\xC4\x8B" => "c",
93 "\xC4\x8C" => "C", "\xC4\x8D" => "c", "\xC4\x8E" => "D", "\xC4\x8F" => "d", 93 "\xC4\x8C" => "C", "\xC4\x8D" => "c", "\xC4\x8E" => "D", "\xC4\x8F" => "d",
94 "\xC4\x90" => "D", "\xC4\x91" => "d", "\xC4\x92" => "E", "\xC4\x93" => "e", 94 "\xC4\x90" => "D", "\xC4\x91" => "d", "\xC4\x92" => "E", "\xC4\x93" => "e",
95 "\xC4\x94" => "E", "\xC4\x95" => "e", "\xC4\x96" => "E", "\xC4\x97" => "e", 95 "\xC4\x94" => "E", "\xC4\x95" => "e", "\xC4\x96" => "E", "\xC4\x97" => "e",
96 "\xC4\x98" => "E", "\xC4\x99" => "e", "\xC4\x9A" => "E", "\xC4\x9B" => "e", 96 "\xC4\x98" => "E", "\xC4\x99" => "e", "\xC4\x9A" => "E", "\xC4\x9B" => "e",
97 "\xC4\x9C" => "G", "\xC4\x9D" => "g", "\xC4\x9E" => "G", "\xC4\x9F" => "g", 97 "\xC4\x9C" => "G", "\xC4\x9D" => "g", "\xC4\x9E" => "G", "\xC4\x9F" => "g",
98 "\xC4\xA0" => "G", "\xC4\xA1" => "g", "\xC4\xA2" => "G", "\xC4\xA3" => "g", 98 "\xC4\xA0" => "G", "\xC4\xA1" => "g", "\xC4\xA2" => "G", "\xC4\xA3" => "g",
99 "\xC4\xA4" => "H", "\xC4\xA5" => "h", "\xC4\xA6" => "H", "\xC4\xA7" => "h", 99 "\xC4\xA4" => "H", "\xC4\xA5" => "h", "\xC4\xA6" => "H", "\xC4\xA7" => "h",
100 "\xC4\xA8" => "I", "\xC4\xA9" => "i", "\xC4\xAA" => "I", "\xC4\xAB" => "i", 100 "\xC4\xA8" => "I", "\xC4\xA9" => "i", "\xC4\xAA" => "I", "\xC4\xAB" => "i",
101 "\xC4\xAC" => "I", "\xC4\xAD" => "i", "\xC4\xAE" => "I", "\xC4\xAF" => "i", 101 "\xC4\xAC" => "I", "\xC4\xAD" => "i", "\xC4\xAE" => "I", "\xC4\xAF" => "i",
102 "\xC4\xB0" => "I", "\xC4\xB1" => "i", "\xC4\xB2" => "IJ","\xC4\xB3" => "ij", 102 "\xC4\xB0" => "I", "\xC4\xB1" => "i", "\xC4\xB2" => "IJ","\xC4\xB3" => "ij",
103 "\xC4\xB4" => "J", "\xC4\xB5" => "j", "\xC4\xB6" => "K", "\xC4\xB7" => "k", 103 "\xC4\xB4" => "J", "\xC4\xB5" => "j", "\xC4\xB6" => "K", "\xC4\xB7" => "k",
104 "\xC4\xB8" => "k", "\xC4\xB9" => "L", "\xC4\xBA" => "l", "\xC4\xBB" => "L", 104 "\xC4\xB8" => "k", "\xC4\xB9" => "L", "\xC4\xBA" => "l", "\xC4\xBB" => "L",
105 "\xC4\xBC" => "l", "\xC4\xBD" => "L", "\xC4\xBE" => "l", "\xC4\xBF" => "L", 105 "\xC4\xBC" => "l", "\xC4\xBD" => "L", "\xC4\xBE" => "l", "\xC4\xBF" => "L",
106 "\xC5\x80" => "l", "\xC5\x81" => "L", "\xC5\x82" => "l", "\xC5\x83" => "N", 106 "\xC5\x80" => "l", "\xC5\x81" => "L", "\xC5\x82" => "l", "\xC5\x83" => "N",
107 "\xC5\x84" => "n", "\xC5\x85" => "N", "\xC5\x86" => "n", "\xC5\x87" => "N", 107 "\xC5\x84" => "n", "\xC5\x85" => "N", "\xC5\x86" => "n", "\xC5\x87" => "N",
108 "\xC5\x88" => "n", "\xC5\x89" => "n", "\xC5\x8A" => "N", "\xC5\x8B" => "n", 108 "\xC5\x88" => "n", "\xC5\x89" => "n", "\xC5\x8A" => "N", "\xC5\x8B" => "n",
109 "\xC5\x8C" => "O", "\xC5\x8D" => "o", "\xC5\x8E" => "O", "\xC5\x8F" => "o", 109 "\xC5\x8C" => "O", "\xC5\x8D" => "o", "\xC5\x8E" => "O", "\xC5\x8F" => "o",
110 "\xC5\x90" => "O", "\xC5\x91" => "o", "\xC5\x92" => "CE","\xC5\x93" => "ce", 110 "\xC5\x90" => "O", "\xC5\x91" => "o", "\xC5\x92" => "CE","\xC5\x93" => "ce",
111 "\xC5\x94" => "R", "\xC5\x95" => "r", "\xC5\x96" => "R", "\xC5\x97" => "r", 111 "\xC5\x94" => "R", "\xC5\x95" => "r", "\xC5\x96" => "R", "\xC5\x97" => "r",
112 "\xC5\x98" => "R", "\xC5\x99" => "r", "\xC5\x9A" => "S", "\xC5\x9B" => "s", 112 "\xC5\x98" => "R", "\xC5\x99" => "r", "\xC5\x9A" => "S", "\xC5\x9B" => "s",
113 "\xC5\x9C" => "S", "\xC5\x9D" => "s", "\xC5\x9E" => "S", "\xC5\x9F" => "s", 113 "\xC5\x9C" => "S", "\xC5\x9D" => "s", "\xC5\x9E" => "S", "\xC5\x9F" => "s",
114 "\xC5\xA0" => "S", "\xC5\xA1" => "s", "\xC5\xA2" => "T", "\xC5\xA3" => "t", 114 "\xC5\xA0" => "S", "\xC5\xA1" => "s", "\xC5\xA2" => "T", "\xC5\xA3" => "t",
115 "\xC5\xA4" => "T", "\xC5\xA5" => "t", "\xC5\xA6" => "T", "\xC5\xA7" => "t", 115 "\xC5\xA4" => "T", "\xC5\xA5" => "t", "\xC5\xA6" => "T", "\xC5\xA7" => "t",
116 "\xC5\xA8" => "U", "\xC5\xA9" => "u", "\xC5\xAA" => "U", "\xC5\xAB" => "u", 116 "\xC5\xA8" => "U", "\xC5\xA9" => "u", "\xC5\xAA" => "U", "\xC5\xAB" => "u",
117 "\xC5\xAC" => "U", "\xC5\xAD" => "u", "\xC5\xAE" => "U", "\xC5\xAF" => "u", 117 "\xC5\xAC" => "U", "\xC5\xAD" => "u", "\xC5\xAE" => "U", "\xC5\xAF" => "u",
118 "\xC5\xB0" => "U", "\xC5\xB1" => "u", "\xC5\xB2" => "U", "\xC5\xB3" => "u", 118 "\xC5\xB0" => "U", "\xC5\xB1" => "u", "\xC5\xB2" => "U", "\xC5\xB3" => "u",
119 "\xC5\xB4" => "W", "\xC5\xB5" => "w", "\xC5\xB6" => "Y", "\xC5\xB7" => "y", 119 "\xC5\xB4" => "W", "\xC5\xB5" => "w", "\xC5\xB6" => "Y", "\xC5\xB7" => "y",
120 "\xC5\xB8" => "Y", "\xC5\xB9" => "Z", "\xC5\xBA" => "z", "\xC5\xBB" => "Z", 120 "\xC5\xB8" => "Y", "\xC5\xB9" => "Z", "\xC5\xBA" => "z", "\xC5\xBB" => "Z",
121 "\xC5\xBC" => "z", "\xC5\xBD" => "Z", "\xC5\xBE" => "z", "\xC6\x8F" => "E", 121 "\xC5\xBC" => "z", "\xC5\xBD" => "Z", "\xC5\xBE" => "z", "\xC6\x8F" => "E",
122 "\xC6\xA0" => "O", "\xC6\xA1" => "o", "\xC6\xAF" => "U", "\xC6\xB0" => "u", 122 "\xC6\xA0" => "O", "\xC6\xA1" => "o", "\xC6\xAF" => "U", "\xC6\xB0" => "u",
123 "\xC7\x8D" => "A", "\xC7\x8E" => "a", "\xC7\x8F" => "I", 123 "\xC7\x8D" => "A", "\xC7\x8E" => "a", "\xC7\x8F" => "I",
124 "\xC7\x90" => "i", "\xC7\x91" => "O", "\xC7\x92" => "o", "\xC7\x93" => "U", 124 "\xC7\x90" => "i", "\xC7\x91" => "O", "\xC7\x92" => "o", "\xC7\x93" => "U",
125 "\xC7\x94" => "u", "\xC7\x95" => "U", "\xC7\x96" => "u", "\xC7\x97" => "U", 125 "\xC7\x94" => "u", "\xC7\x95" => "U", "\xC7\x96" => "u", "\xC7\x97" => "U",
126 "\xC7\x98" => "u", "\xC7\x99" => "U", "\xC7\x9A" => "u", "\xC7\x9B" => "U", 126 "\xC7\x98" => "u", "\xC7\x99" => "U", "\xC7\x9A" => "u", "\xC7\x9B" => "U",
127 "\xC7\x9C" => "u", 127 "\xC7\x9C" => "u",
128 "\xC7\xBA" => "A", "\xC7\xBB" => "a", "\xC7\xBC" => "AE","\xC7\xBD" => "ae", 128 "\xC7\xBA" => "A", "\xC7\xBB" => "a", "\xC7\xBC" => "AE","\xC7\xBD" => "ae",
129 "\xC7\xBE" => "O", "\xC7\xBF" => "o", 129 "\xC7\xBE" => "O", "\xC7\xBF" => "o",
130 "\xC9\x99" => "e", 130 "\xC9\x99" => "e",
131 "\xC2\x82" => ",", # High code comma 131 "\xC2\x82" => ",", # High code comma
132 "\xC2\x84" => ",,", # High code double comma 132 "\xC2\x84" => ",,", # High code double comma
133 "\xC2\x85" => "...", # Triple dot 133 "\xC2\x85" => "...", # Triple dot
134 "\xC2\x88" => "^", # High caret 134 "\xC2\x88" => "^", # High caret
135 "\xC2\x91" => "\x27", # Forward single quote 135 "\xC2\x91" => "\x27", # Forward single quote
136 "\xC2\x92" => "\x27", # Reverse single quote 136 "\xC2\x92" => "\x27", # Reverse single quote
137 "\xC2\x93" => "\x22", # Forward double quote 137 "\xC2\x93" => "\x22", # Forward double quote
138 "\xC2\x94" => "\x22", # Reverse double quote 138 "\xC2\x94" => "\x22", # Reverse double quote
139 "\xC2\x96" => "-", # High hyphen 139 "\xC2\x96" => "-", # High hyphen
140 "\xC2\x97" => "--", # Double hyphen 140 "\xC2\x97" => "--", # Double hyphen
141 "\xC2\xA6" => "|", # Split vertical bar 141 "\xC2\xA6" => "|", # Split vertical bar
142 "\xC2\xAB" => "<<", # Double less than 142 "\xC2\xAB" => "<<", # Double less than
143 "\xC2\xBB" => ">>", # Double greater than 143 "\xC2\xBB" => ">>", # Double greater than
144 "\xC2\xBC" => "1/4", # one quarter 144 "\xC2\xBC" => "1/4", # one quarter
145 "\xC2\xBD" => "1/2", # one half 145 "\xC2\xBD" => "1/2", # one half
146 "\xC2\xBE" => "3/4", # three quarters 146 "\xC2\xBE" => "3/4", # three quarters
147 "\xCA\xBF" => "\x27", # c-single quote 147 "\xCA\xBF" => "\x27", # c-single quote
148 "\xCC\xA8" => "", # modifier - under curve 148 "\xCC\xA8" => "", # modifier - under curve
149 "\xCC\xB1" => "", # modifier - under line 149 "\xCC\xB1" => "", # modifier - under line
150 # /\W/ => "" 150 # /\W/ => ""
151 } 151 }
152 152
153 end 153 end
154 154
155 # Extension of the standard class String with useful functions. 155 # Extension of the standard class String with useful functions.
156 class String 156 class String
157 include Mirimiri 157 include Mirimiri
158 158
159 def unaccent 159 def unaccent
160 # force_encoding is needed with ruby1.9 160 # force_encoding is needed with ruby1.9
161 Transmap.inject(self.force_encoding("ASCII-8BIT")) { |str, (utf8, asc)| str.gsub(utf8, asc) } 161 Transmap.inject(self.force_encoding("ASCII-8BIT")) { |str, (utf8, asc)| str.gsub(utf8, asc) }
162 end 162 end
163 163
164 # Returns +true+ if +self+ belongs to Mirimiri::Stoplist, +false+ otherwise. 164 # Returns +true+ if +self+ belongs to Mirimiri::Stoplist, +false+ otherwise.
165 def is_stopword? 165 def is_stopword?
166 Stoplist.include?(self.downcase) 166 self.split.all? { |e| Stoplist.include?(e.downcase) }
167 end 167 end
168 168
169 def sequential_dependence_model t=0.85,o=0.10,u=0.05,field=nil 169 def sequential_dependence_model t=0.85,o=0.10,u=0.05,field=nil
170 d = Mirimiri::Document.new self 170 d = Mirimiri::Document.new self
171 171
172 if field.nil? 172 if field.nil?
173 ematch = d.ngrams(2).collect { |ng| "#1(#{ng})" } 173 ematch = d.ngrams(2).collect { |ng| "#1(#{ng})" }
174 pmatch = d.ngrams(2).collect { |ng| "#uw8(#{ng})" } 174 pmatch = d.ngrams(2).collect { |ng| "#uw8(#{ng})" }
175 else 175 else
176 ematch = d.ngrams(2).collect { |ng| "#1(#{ng}).(#{field})" } 176 ematch = d.ngrams(2).collect { |ng| "#1(#{ng}).(#{field})" }
177 pmatch = d.ngrams(2).collect { |ng| "#uw8(#{ng}).(#{field})" } 177 pmatch = d.ngrams(2).collect { |ng| "#uw8(#{ng}).(#{field})" }
178 end 178 end
179 179
180 if ematch.empty? 180 if ematch.empty?
181 if field.nil? 181 if field.nil?
182 ematch = d.words.collect { |ng| "#1(#{ng})" } 182 ematch = d.words.collect { |ng| "#1(#{ng})" }
183 pmatch = d.words.collect { |ng| "#uw8(#{ng})" } 183 pmatch = d.words.collect { |ng| "#uw8(#{ng})" }
184 else 184 else
185 ematch = d.words.collect { |ng| "#1(#{ng}).(#{field})" } 185 ematch = d.words.collect { |ng| "#1(#{ng}).(#{field})" }
186 pmatch = d.words.collect { |ng| "#uw8(#{ng}).(#{field})" } 186 pmatch = d.words.collect { |ng| "#uw8(#{ng}).(#{field})" }
187 end 187 end
188 end 188 end
189 189
190 "#weight ( #{t} #combine( #{self} ) #{o} #combine ( #{ematch.join(" ")} ) #{u} #combine ( #{pmatch.join(" ")} ) )" 190 "#weight ( #{t} #combine( #{d.words.join(" ")} ) #{o} #combine ( #{ematch.join(" ")} ) #{u} #combine ( #{pmatch.join(" ")} ) )"
191 end 191 end
192 192
193 # Do not use. 193 # Do not use.
194 # TODO: revamp. Find out why this function is here. 194 # TODO: revamp. Find out why this function is here.
195 def remove_special_characters 195 def remove_special_characters
196 self.split.collect { |w| w.gsub(/\W/,' ').split.collect { |w| w.gsub(/\W/,' ').strip.sub(/\A.\z/, '')}.join(' ').strip.sub(/\A.\z/, '')}.join(' ') 196 self.split.collect { |w| w.gsub(/\W/,' ').split.collect { |w| w.gsub(/\W/,' ').strip.sub(/\A.\z/, '')}.join(' ').strip.sub(/\A.\z/, '')}.join(' ')
197 end 197 end
198 198
199 # Removes all XML-like tags from +self+. 199 # Removes all XML-like tags from +self+.
200 # 200 #
201 # s = "<html><body>test</body></html>" 201 # s = "<html><body>test</body></html>"
202 # s.strip_xml_tags! 202 # s.strip_xml_tags!
203 # s #=> "test" 203 # s #=> "test"
204 def strip_xml_tags! 204 def strip_xml_tags!
205 replace strip_with_pattern /<\/?[^>]*>/ 205 replace strip_with_pattern /<\/?[^>]*>/
206 end 206 end
207 207
208 # Removes all XML-like tags from +self+. 208 # Removes all XML-like tags from +self+.
209 # 209 #
210 # s = "<html><body>test</body></html>" 210 # s = "<html><body>test</body></html>"
211 # s.strip_xml_tags #=> "test" 211 # s.strip_xml_tags #=> "test"
212 # s #=> "<html><body>test</body></html>" 212 # s #=> "<html><body>test</body></html>"
213 def strip_xml_tags 213 def strip_xml_tags
214 dup.strip_xml_tags! 214 dup.strip_xml_tags!
215 end 215 end
216 216
217 # Removes all Javascript sources from +self+. 217 # Removes all Javascript sources from +self+.
218 # 218 #
219 # s = "<script type='text/javascript'> 219 # s = "<script type='text/javascript'>
220 # var skin='vector', 220 # var skin='vector',
221 # stylepath='http://bits.wikimedia.org/skins-1.5' 221 # stylepath='http://bits.wikimedia.org/skins-1.5'
222 # </script> 222 # </script>
223 # 223 #
224 # test" 224 # test"
225 # s.strip_javascripts! 225 # s.strip_javascripts!
226 # s #=> "test" 226 # s #=> "test"
227 def strip_javascripts! 227 def strip_javascripts!
228 replace strip_with_pattern /<script type="text\/javascript">(.+?)<\/script>/m 228 replace strip_with_pattern /<script type="text\/javascript">(.+?)<\/script>/m
229 end 229 end
230 230
231 # Removes all Javascript sources from +self+. 231 # Removes all Javascript sources from +self+.
232 # 232 #
233 # s = "<script type='text/javascript'> 233 # s = "<script type='text/javascript'>
234 # var skin='vector', 234 # var skin='vector',
235 # stylepath='http://bits.wikimedia.org/skins-1.5' 235 # stylepath='http://bits.wikimedia.org/skins-1.5'
236 # </script> 236 # </script>
237 # 237 #
238 # test" 238 # test"
239 # s.strip_javascripts #=> "test" 239 # s.strip_javascripts #=> "test"
240 def strip_javascripts 240 def strip_javascripts
241 dup.strip_javascripts! 241 dup.strip_javascripts!
242 end 242 end
243 243
244 def strip_stylesheets! 244 def strip_stylesheets!
245 # TODO: revamp. Unclear what this is for. 245 # TODO: revamp. Unclear what this is for.
246 replace strip_with_pattern /<style type="text\/css">(.+?)<\/style>/m 246 replace strip_with_pattern /<style type="text\/css">(.+?)<\/style>/m
247 end 247 end
248 248
249 def strip_stylesheets 249 def strip_stylesheets
250 dup.strip_stylesheets! 250 dup.strip_stylesheets!
251 end 251 end
252 252
253 # Removes punctuation from +self+. 253 # Removes punctuation from +self+.
254 # 254 #
255 # s = "hello, world. how are you?!" 255 # s = "hello, world. how are you?!"
256 # s.strip_punctuation! 256 # s.strip_punctuation!
257 # s # => "hello world how are you" 257 # s # => "hello world how are you"
258 def strip_punctuation! 258 def strip_punctuation!
259 replace strip_with_pattern /[^a-zA-Z0-9\-\s]/ 259 replace strip_with_pattern /[^a-zA-Z0-9\-\s]/
260 end 260 end
261 261
262 # Removes punctuation from +self+. 262 # Removes punctuation from +self+.
263 # 263 #
264 # s = "hello, world. how are you?!" 264 # s = "hello, world. how are you?!"
265 # s.strip_punctuation # => "hello world how are you" 265 # s.strip_punctuation # => "hello world how are you"
266 def strip_punctuation 266 def strip_punctuation
267 dup.strip_punctuation! 267 dup.strip_punctuation!
268 end 268 end
269 269
270 # Returns the text values inside all occurrences of an XML tag in +self+ 270 # Returns the text values inside all occurrences of an XML tag in +self+
271 # 271 #
272 # s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre" 272 # s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre"
273 # s.extract_xmltags_values 'a' #=> ["Indianapolis", "Indiana"] 273 # s.extract_xmltags_values 'a' #=> ["Indianapolis", "Indiana"]
274 def extract_xmltags_values(tag_name) 274 def extract_xmltags_values(tag_name)
275 self.scan(/<#{tag_name}.*?>(.+?)<\/#{tag_name}>/).flatten 275 self.scan(/<#{tag_name}.*?>(.+?)<\/#{tag_name}>/).flatten
276 end 276 end
277 277
278 def strip_with_pattern(pattern) 278 def strip_with_pattern(pattern)
279 require 'cgi' 279 require 'cgi'
280 280
281 CGI::unescapeHTML(self.gsub(pattern,"")).unaccent.encode("UTF-8", {:invalid => :replace, :undef => :replace, :replace => " "}) 281 CGI::unescapeHTML(self.gsub(pattern,"")).unaccent.encode("UTF-8", {:invalid => :replace, :undef => :replace, :replace => " "})
282 end 282 end
283 283
284 private :strip_with_pattern 284 private :strip_with_pattern
285 end 285 end
286 286
287 module Indri 287 module Indri
288 class IndriPrintedDocuments < String 288 class IndriPrintedDocuments < String
289 289
290 def extract_docs 290 def extract_docs
291 self.split(/\d+ Q0 .+ \d+ -\d+.\d+ .+/).delete_if{ |x| x.empty? } 291 self.split(/\d+ Q0 .+ \d+ -\d+.\d+ .+/).delete_if{ |x| x.empty? }
292 end 292 end
293 end 293 end
294 end 294 end
295 295
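The string.rb hunks above cover the other two parts of the commit: is_stopword? now splits the receiver and checks every token against Mirimiri::Stoplist, so it gives a sensible answer for multiword expressions, and sequential_dependence_model (the "SDM fix") builds its first #combine clause from the cleaned word list d.words instead of interpolating the raw string. A sketch of the expected behaviour, not part of the commit (the query string is illustrative, and the output is wrapped here for readability):

require 'mirimiri'

"the".is_stopword?              #=> true
"of the".is_stopword?           #=> true,  every token is in Mirimiri::Stoplist
"escape plan".is_stopword?      #=> false, no token is a stopword
"the escape plan".is_stopword?  #=> false, "escape" and "plan" are not stopwords

puts "dillinger escape plan".sequential_dependence_model
#=> #weight ( 0.85 #combine( dillinger escape plan )
#             0.1 #combine ( #1(dillinger escape) #1(escape plan) )
#             0.05 #combine ( #uw8(dillinger escape) #uw8(escape plan) ) )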
1 $LOAD_PATH.unshift File.expand_path(File.join(File.dirname(__FILE__), "lib")) 1 $LOAD_PATH.unshift File.expand_path(File.join(File.dirname(__FILE__), "lib"))
2 2
3 require 'mirimiri' 3 require 'mirimiri'
4 require "benchmark"
4 5
5 w = Mirimiri::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan") 6 w = Mirimiri::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan")
6 p w.entropy("dillinger escape plan") 7 p w.entropy("dillinger escape plan")
7 p w.tf("guitar") 8 p w.tf("guitar")
8 9
9 query = Indri::IndriQuery.new({:query => "dillinger escape plan".sequential_dependence_model, :count => 10}, "-trecFormat=true -printDocuments=true") 10 query = Indri::IndriQuery.new({:query => "dillinger escape plan".sequential_dependence_model, :count => 10}, "-trecFormat=true -printDocuments=true")
10 index = Indri::IndriIndex.new "/mnt/disk1/ClueWeb09_English_1noSpam" 11 index = Indri::IndriIndex.new "/mnt/disk1/ClueWeb09_English_1noSpam"
11 s = Indri::IndriPrintedDocuments.new(index.runquery(query).force_encoding("ISO-8859-1").encode("UTF-8")) 12 s = Indri::IndriPrintedDocuments.new(index.runquery(query).force_encoding("ISO-8859-1").encode("UTF-8"))
12 13
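The test script now also requires "benchmark" but does not yet use it. A hypothetical continuation (not in the commit) showing how the effect of the new @ngrams cache could be timed on the page w built above:

Benchmark.bm(22) do |x|
  x.report("first ngrams(2) call")  { w.ngrams(2) }   # builds and caches the bigrams
  x.report("second ngrams(2) call") { w.ngrams(2) }   # served straight from @ngrams[2]
end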