Commit b0ffa2ad49638e2a223fff528de1a4ad336acb72
1 parent
b3c0213975
Exists in
master
finally committing some recent changes
Showing 7 changed files with 146 additions and 16 deletions Inline Diff
README.markdown
1 | # mirimiri | 1 | # mirimiri |
2 | 2 | ||
3 | Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | 3 | The various tools of this project were developed for research purposes during |
4 | my Ph.D. and heavily rely on the use of Indri (<http://lemurproject.org/indri.php>). | ||
5 | Setting up Ruby is not as painful as it used to be, thanks to RVM (<https://rvm.io/>). | ||
6 | Please visit at least these two websites before trying to use `mirimiri`. | ||
7 | |||
8 | |||
9 | Copyright (C) 2010-2013 Romain Deveaud <romain.deveaud@gmail.com> | ||
4 | 10 | ||
5 | > The Fijian monkey-faced bat (Mirimiri acrodonta), also called the Fiji | 11 | > The Fijian monkey-faced bat (Mirimiri acrodonta), also called the Fiji |
6 | > Flying Fox, is an Old World fruit bat endemic to Fiji. It was discovered | 12 | > Flying Fox, is an Old World fruit bat endemic to Fiji. It was discovered |
7 | > in the hills of Taveuni by Bill Beckon in 1977 and is Fiji's only endemic | 13 | > in the hills of Taveuni by Bill Beckon in 1977 and is Fiji's only endemic |
8 | > mammal. It is listed as a critically endangered species due to habitat | 14 | > mammal. It is listed as a critically endangered species due to habitat |
9 | > loss. It has recently been transferred from Pteralopex to its own | 15 | > loss. It has recently been transferred from Pteralopex to its own |
10 | > monotypic genus Mirimiri. | 16 | > monotypic genus Mirimiri. |
11 | > | 17 | > |
12 | > ##### Wikipedia | 18 | > ##### Wikipedia |
13 | 19 | ||
14 | License | 20 | License |
15 | ======= | 21 | ======= |
16 | 22 | ||
17 | This program is free software: you can redistribute it and/or modify | 23 | This program is free software: you can redistribute it and/or modify |
18 | it under the terms of the GNU General Public License as published by | 24 | it under the terms of the GNU General Public License as published by |
19 | the Free Software Foundation, either version 3 of the License, or | 25 | the Free Software Foundation, either version 3 of the License, or |
20 | (at your option) any later version. | 26 | (at your option) any later version. |
21 | 27 | ||
22 | This program is distributed in the hope that it will be useful, | 28 | This program is distributed in the hope that it will be useful, |
23 | but WITHOUT ANY WARRANTY; without even the implied warranty of | 29 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
24 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 30 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
25 | GNU General Public License for more details. | 31 | GNU General Public License for more details. |
26 | 32 | ||
27 | You should have received a copy of the GNU General Public License | 33 | You should have received a copy of the GNU General Public License |
28 | along with this program. If not, see <http://www.gnu.org/licenses/>. | 34 | along with this program. If not, see <http://www.gnu.org/licenses/>. |
29 | 35 |
lib/mirimiri/document.rb
1 | #!/usr/bin/env ruby | 1 | #!/usr/bin/env ruby |
2 | 2 | ||
3 | #-- | 3 | #-- |
4 | # This file is a part of the mirimiri library | 4 | # This file is a part of the mirimiri library |
5 | # | 5 | # |
6 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | 6 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> |
7 | # | 7 | # |
8 | # This program is free software: you can redistribute it and/or modify | 8 | # This program is free software: you can redistribute it and/or modify |
9 | # it under the terms of the GNU General Public License as published by | 9 | # it under the terms of the GNU General Public License as published by |
10 | # the Free Software Foundation, either version 3 of the License, or | 10 | # the Free Software Foundation, either version 3 of the License, or |
11 | # (at your option) any later version. | 11 | # (at your option) any later version. |
12 | # | 12 | # |
13 | # This program is distributed in the hope that it will be useful, | 13 | # This program is distributed in the hope that it will be useful, |
14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | 14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | # GNU General Public License for more details. | 16 | # GNU General Public License for more details. |
17 | # | 17 | # |
18 | # You should have received a copy of the GNU General Public License | 18 | # You should have received a copy of the GNU General Public License |
19 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | 19 | # along with this program. If not, see <http://www.gnu.org/licenses/>. |
20 | #++ | 20 | #++ |
21 | 21 | ||
22 | 22 | ||
23 | # General module | 23 | # General module |
24 | module Mirimiri | 24 | module Mirimiri |
25 | 25 | ||
26 | # A Document is a bag of words and is constructed from a string. | 26 | # A Document is a bag of words and is constructed from a string. |
27 | class Document | 27 | class Document |
28 | attr_reader :words, :doc_content, :count_words | 28 | attr_reader :words, :doc_content, :xcount |
29 | 29 | ||
30 | # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html | 30 | # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html |
31 | # and the \\W special escape). | 31 | # and the \\W special escape). |
32 | # | 32 | # |
33 | # Protected function, only meant to be called at the initialization. | 33 | # Protected function, only meant to be called at the initialization. |
34 | def format_words | 34 | def format_words |
35 | wo = [] | 35 | wo = [] |
36 | 36 | ||
37 | @doc_content.split.each do |w| | 37 | @doc_content.split.each do |w| |
38 | w.split(/\W/).each do |sw| | 38 | w.split(/\W/).each do |sw| |
39 | wo.push(sw.downcase) if sw =~ /[a-zA-Z]/ | 39 | wo.push(sw.downcase) if sw =~ /[[:alpha:]]/ |
40 | end | 40 | end |
41 | end | 41 | end |
42 | 42 | ||
43 | wo | 43 | wo |
44 | end | 44 | end |
45 | 45 | ||
46 | # Returns an Array containing the +n+-grams (words) from the current Document. | 46 | # Returns an Array containing the +n+-grams (words) from the current Document. |
47 | # | 47 | # |
48 | # ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...] | 48 | # ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...] |
49 | def ngrams(n) | 49 | def ngrams(n) |
50 | window = [] | 50 | window = [] |
51 | ngrams_array = [] | 51 | ngrams_array = [] |
52 | 52 | ||
53 | if @ngrams[n].nil? | 53 | if @ngrams[n].nil? |
54 | @words.each do |w| | 54 | @words.each do |w| |
55 | window.push(w) | 55 | window.push(w) |
56 | if window.size == n | 56 | if window.size == n |
57 | ngrams_array.push window.join(" ") | 57 | ngrams_array.push window.join(" ") |
58 | window.delete_at(0) | 58 | window.delete_at(0) |
59 | end | 59 | end |
60 | end | 60 | end |
61 | @ngrams[n] = ngrams_array | 61 | @ngrams[n] = ngrams_array |
62 | end | 62 | end |
63 | 63 | ||
64 | @ngrams[n] | 64 | @ngrams[n] |
65 | end | 65 | end |
66 | 66 | ||
67 | # Returns a Hash containing the words and their associated counts in the current Document. | 67 | # Returns a Hash containing the words and their associated counts in the current Document. |
68 | # | 68 | # |
69 | # count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... } | 69 | # count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... } |
70 | def count_words | 70 | def count_words |
71 | counts = Hash.new { |h,k| h[k] = 0 } | 71 | counts = Hash.new { |h,k| h[k] = 0 } |
72 | @words.each { |w| counts[w] += 1 } | 72 | @words.each { |w| counts[w] += 1 } |
73 | 73 | ||
74 | counts | 74 | counts |
75 | end | 75 | end |
76 | 76 | ||
77 | # Old entropy function. | 77 | # Old entropy function. |
78 | # TODO: remove. | 78 | # TODO: remove. |
79 | def entropy0(s) | 79 | def entropy0(s) |
80 | en = 0.0 | 80 | en = 0.0 |
81 | 81 | ||
82 | s.split.each do |w| | 82 | s.split.each do |w| |
83 | p_wi = @count_words[w].to_f/@words.count.to_f | 83 | p_wi = @xcount[w].to_f/@words.count.to_f |
84 | en += p_wi*Math.log2(p_wi) | 84 | en += p_wi*Math.log2(p_wi) |
85 | end | 85 | end |
86 | 86 | ||
87 | en *= -1 | 87 | en *= -1 |
88 | en | 88 | en |
89 | end | 89 | end |
90 | 90 | ||
91 | # Computes the entropy of a given string +s+ inside the document. | 91 | # Computes the entropy of a given string +s+ inside the document. |
92 | # | 92 | # |
93 | # If the string parameter is composed of many words (i.e. tokens separated | 93 | # If the string parameter is composed of many words (i.e. tokens separated |
94 | # by whitespace(s)), it is considered as an ngram. | 94 | # by whitespace(s)), it is considered as an ngram. |
95 | # | 95 | # |
96 | # entropy("guitar") #=> 0.014348983965324762 | 96 | # entropy("guitar") #=> 0.014348983965324762 |
97 | # entropy("dillinger escape plan") #=> 0.054976093116768154 | 97 | # entropy("dillinger escape plan") #=> 0.054976093116768154 |
98 | def entropy(s) | 98 | def entropy(s) |
99 | en = 0.0 | 99 | en = 0.0 |
100 | 100 | ||
101 | size = s.split.size | 101 | size = s.split.size |
102 | 102 | ||
103 | if size == 1 | 103 | if size == 1 |
104 | p_wi = @count_words[s].to_f/@words.count.to_f | 104 | p_wi = @xcount[s].to_f/@words.count.to_f |
105 | en += p_wi*Math.log(p_wi) | 105 | en += p_wi*Math.log(p_wi) |
106 | elsif size > 1 | 106 | elsif size > 1 |
107 | ng_size = ngrams(size) | 107 | ng_size = ngrams(size) |
108 | p_wi = ng_size.count(s).to_f/ng_size.count.to_f | 108 | p_wi = ng_size.count(s).to_f/ng_size.count.to_f |
109 | en += p_wi*Math.log(p_wi) | 109 | en += p_wi*Math.log(p_wi) |
110 | end | 110 | end |
111 | 111 | ||
112 | en *= -1 | 112 | en *= -1 |
113 | en | 113 | en |
114 | end | 114 | end |
115 | 115 | ||
116 | # Computes the term frequency of a given *word* +s+. | 116 | # Computes the term frequency of a given *word* +s+. |
117 | # | 117 | # |
118 | # tf("guitar") #=> 0.000380372765310004 | 118 | # tf("guitar") #=> 0.000380372765310004 |
119 | def tf(s) | 119 | def tf(s) |
120 | @count_words[s].to_f/@words.size.to_f | 120 | @xcount[s].to_f/@words.size.to_f |
121 | end | 121 | end |
122 | 122 | ||
123 | # Computes the KL divergence between the language model of +self+ | ||
124 | # and the language model of +doc+. | ||
125 | # | ||
126 | # KL is not symmetric, see http://en.wikipedia.org/wiki/Kullback-Leibler_divergence | ||
127 | # for more information. | ||
128 | # | ||
129 | # d1.kl(d2) #=> 0.2971808085725761 | ||
130 | def kl(doc) | ||
131 | raise ArgumentError, 'Argument is not a Mirimiri::Document' unless doc.is_a? Mirimiri::Document | ||
132 | |||
133 | vocab = self.words & doc.words | ||
123 | 134 | ||
135 | vocab.inject(0.0) { |res,w| res + self.tf(w)*Math.log(self.tf(w)/doc.tf(w)) } | ||
136 | end | ||
137 | |||
124 | def initialize(content="") | 138 | def initialize(content="") |
125 | @doc_content = content | 139 | @doc_content = content |
126 | @words = format_words | 140 | @words = format_words |
127 | @count_words = count_words | 141 | @xcount = count_words |
128 | @ngrams = {} | 142 | @ngrams = {} |
129 | end | 143 | end |
130 | 144 | ||
131 | protected :format_words, :count_words | 145 | protected :format_words, :count_words |
132 | end | 146 | end |
133 | 147 | ||
134 | # A WebDocument is a Document with a +url+. | 148 | # A WebDocument is a Document with a +url+. |
135 | class WebDocument < Document | 149 | class WebDocument < Document |
136 | attr_reader :url | 150 | attr_reader :url |
137 | 151 | ||
138 | # Returns the HTML text from the page of a given +url+. | 152 | # Returns the HTML text from the page of a given +url+. |
139 | def self.get_content(url) | 153 | def self.get_content(url) |
140 | require 'net/http' | 154 | require 'net/http' |
141 | Net::HTTP.get(URI.parse(url)) | 155 | Net::HTTP.get(URI.parse(url)) |
142 | end | 156 | end |
143 | 157 | ||
144 | 158 | ||
145 | # WebDocument constructor, the content of the Document is the HTML page | 159 | # WebDocument constructor, the content of the Document is the HTML page |
146 | # without the tags. | 160 | # without the tags. |
147 | def initialize(url,only_tags=nil) | 161 | def initialize(url,only_tags=nil) |
148 | require 'sanitize' | 162 | require 'sanitize' |
149 | 163 | ||
150 | @url = url | 164 | @url = url |
151 | content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("") | 165 | content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("") |
152 | super Sanitize.clean(content.unaccent.toutf8.force_encoding("UTF-8"), :remove_contents => ['script']) | 166 | super Sanitize.clean(content, :remove_contents => ['script','style']) |
153 | end | 167 | end |
154 | end | 168 | end |
155 | 169 | ||
156 | # A WikipediaPage is a WebDocument. | 170 | # A WikipediaPage is a WebDocument. |
157 | class WikipediaPage < WebDocument | 171 | class WikipediaPage < WebDocument |
158 | require 'rexml/document' | 172 | require 'rexml/document' |
159 | require 'net/http' | 173 | require 'net/http' |
160 | require 'kconv' | 174 | require 'kconv' |
161 | 175 | ||
162 | 176 | ||
163 | def self.search_wikipedia_titles(name) | 177 | def self.search_wikipedia_titles(name) |
164 | raise ArgumentError, "Bad encoding", name unless name.isutf8 | 178 | # raise ArgumentError, "Bad encoding", name unless name.isutf8 |
165 | 179 | ||
166 | res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).unaccent.toutf8).elements['api/query/search'] | 180 | res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&srlimit=20&format=xml" ).force_encoding("ISO-8859-1").encode("UTF-8")).elements['api/query/search'] |
167 | 181 | ||
168 | res.collect { |e| e.attributes['title'] } unless res.nil? | 182 | res.collect { |e| e.attributes['title'] } unless res.nil? |
169 | end | 183 | end |
170 | 184 | ||
171 | def self.get_url(name) | 185 | def self.get_url(name) |
172 | raise ArgumentError, "Bad encoding", name unless name.isutf8 | 186 | raise ArgumentError, "Bad encoding", name unless name.isutf8 |
173 | 187 | ||
174 | atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).unaccent.toutf8).elements['api/query/pages/page'].attributes | 188 | atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).unaccent.toutf8).elements['api/query/pages/page'].attributes |
175 | 189 | ||
176 | atts['fullurl'] if atts['missing'].nil? | 190 | atts['fullurl'] if atts['missing'].nil? |
177 | end | 191 | end |
178 | 192 | ||
179 | def self.search_homepage(name) | 193 | def self.search_homepage(name) |
180 | title = WikipediaPage.search_wikipedia_titles name | 194 | title = WikipediaPage.search_wikipedia_titles name |
181 | 195 | ||
182 | WikipediaPage.get_url(title[0]) unless title.nil? || title.empty? | 196 | WikipediaPage.get_url(title[0]) unless title.nil? || title.empty? |
183 | end | 197 | end |
184 | 198 | ||
185 | def self.extract_anchors(url) | 199 | def self.extract_anchors(url) |
186 | self.get_content(url).extract_xmltags_values('p').join(' ').scan(/<a href="(.+?)" title=.*?>(.+?)<\/a>/).delete_if { |a| a[0] =~ /^\/wiki\/.*$/.negated } | 200 | self.get_content(url).extract_xmltags_values('p').join(' ').scan(/<a href="(.+?)" title=.*?>(.+?)<\/a>/).delete_if { |a| a[0] =~ /^\/wiki\/.*$/.negated } |
187 | end | 201 | end |
188 | end | 202 | end |
189 | 203 | ||
190 | class FreebasePage < WebDocument | 204 | class FreebasePage < WebDocument |
191 | require 'net/http' | 205 | require 'net/http' |
192 | require 'kconv' | 206 | require 'kconv' |
193 | require 'json' | 207 | require 'json' |
194 | 208 | ||
195 | def self.search_article_ids query,limit | 209 | def self.search_article_ids query,limit |
196 | raise ArgumentError, "Bad encoding", name unless name.isutf8 | 210 | raise ArgumentError, "Bad encoding", name unless name.isutf8 |
197 | 211 | ||
198 | JSON.parse(Net::HTTP.get( URI.parse "http://api.freebase.com/api/service/search?query=#{query.gsub(" ","+")}&limit=#{limit}" ))['result'].collect { |a| a['article']['id'] unless a['article'].nil? }.compact | 212 | JSON.parse(Net::HTTP.get( URI.parse "http://api.freebase.com/api/service/search?query=#{query.gsub(" ","+")}&limit=#{limit}" ))['result'].collect { |a| a['article']['id'] unless a['article'].nil? }.compact |
199 | end | 213 | end |
200 | 214 | ||
201 | def self.get_url id | 215 | def self.get_url id |
202 | "http://api.freebase.com/api/trans/raw#{id}" | 216 | "http://api.freebase.com/api/trans/raw#{id}" |
203 | end | 217 | end |
204 | end | 218 | end |
205 | end | 219 | end |
206 | 220 |
lib/mirimiri/index.rb
1 | #!/usr/bin/env ruby | 1 | #!/usr/bin/env ruby |
2 | 2 | ||
3 | #-- | 3 | #-- |
4 | # This file is a part of the mirimiri library | 4 | # This file is a part of the mirimiri library |
5 | # | 5 | # |
6 | # Copyright (C) 2010-2012 Romain Deveaud <romain.deveaud@gmail.com> | 6 | # Copyright (C) 2010-2012 Romain Deveaud <romain.deveaud@gmail.com> |
7 | # | 7 | # |
8 | # This program is free software: you can redistribute it and/or modify | 8 | # This program is free software: you can redistribute it and/or modify |
9 | # it under the terms of the GNU General Public License as published by | 9 | # it under the terms of the GNU General Public License as published by |
10 | # the Free Software Foundation, either version 3 of the License, or | 10 | # the Free Software Foundation, either version 3 of the License, or |
11 | # (at your option) any later version. | 11 | # (at your option) any later version. |
12 | # | 12 | # |
13 | # This program is distributed in the hope that it will be useful, | 13 | # This program is distributed in the hope that it will be useful, |
14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | 14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | # GNU General Public License for more details. | 16 | # GNU General Public License for more details. |
17 | # | 17 | # |
18 | # You should have received a copy of the GNU General Public License | 18 | # You should have received a copy of the GNU General Public License |
19 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | 19 | # along with this program. If not, see <http://www.gnu.org/licenses/>. |
20 | #++ | 20 | #++ |
21 | 21 | ||
22 | class Index | 22 | class Index |
23 | end | 23 | end |
24 | 24 | ||
25 | module Indri | 25 | module Indri |
26 | 26 | ||
27 | class IndriIndex < Index | 27 | class IndriIndex < Index |
28 | 28 | ||
29 | def initialize path | 29 | def initialize path |
30 | raise ArgumentError, 'Index path does not exist' unless File.directory? path | 30 | raise ArgumentError, 'Index path does not exist' unless File.directory? path |
31 | @path = path | 31 | @path = path |
32 | end | 32 | end |
33 | 33 | ||
34 | def runquery indriquery | 34 | def runquery indriquery |
35 | raise ArgumentError, 'Argument is not an IndriQuery' unless indriquery.is_a? Indri::IndriQuery | 35 | raise ArgumentError, 'Argument is not an Indri::IndriQuery' unless indriquery.is_a? Indri::IndriQuery |
36 | 36 | ||
37 | query = "IndriRunQuery -query=\"#{indriquery.query}\" -index=#{@path}" | 37 | query = "IndriRunQuery -query=\"#{indriquery.query}\" -index=#{@path}" |
38 | 38 | ||
39 | query += " -count=#{indriquery.count}" unless indriquery.count.nil? | 39 | query += " -count=#{indriquery.count}" unless indriquery.count.nil? |
40 | query += " -rule=method:#{indriquery.sm_method},#{indriquery.sm_param}:#{indriquery.sm_value}" unless indriquery.sm_method.nil? | 40 | query += " -rule=method:#{indriquery.sm_method},#{indriquery.sm_param}:#{indriquery.sm_value}" unless indriquery.sm_method.nil? |
41 | query += " #{indriquery.args}" unless indriquery.args.nil? | 41 | query += " #{indriquery.args}" unless indriquery.args.nil? |
42 | 42 | ||
43 | res = `#{query}` | 43 | res = `#{query}` |
44 | 44 | ||
45 | res | 45 | res |
46 | end | 46 | end |
47 | end | 47 | end |
48 | end | 48 | end |
49 | 49 |
lib/mirimiri/query.rb
1 | #!/usr/bin/env ruby | 1 | #!/usr/bin/env ruby |
2 | 2 | ||
3 | #-- | 3 | #-- |
4 | # This file is a part of the mirimiri library | 4 | # This file is a part of the mirimiri library |
5 | # | 5 | # |
6 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | 6 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> |
7 | # | 7 | # |
8 | # This program is free software: you can redistribute it and/or modify | 8 | # This program is free software: you can redistribute it and/or modify |
9 | # it under the terms of the GNU General Public License as published by | 9 | # it under the terms of the GNU General Public License as published by |
10 | # the Free Software Foundation, either version 3 of the License, or | 10 | # the Free Software Foundation, either version 3 of the License, or |
11 | # (at your option) any later version. | 11 | # (at your option) any later version. |
12 | # | 12 | # |
13 | # This program is distributed in the hope that it will be useful, | 13 | # This program is distributed in the hope that it will be useful, |
14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | 14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | # GNU General Public License for more details. | 16 | # GNU General Public License for more details. |
17 | # | 17 | # |
18 | # You should have received a copy of the GNU General Public License | 18 | # You should have received a copy of the GNU General Public License |
19 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | 19 | # along with this program. If not, see <http://www.gnu.org/licenses/>. |
20 | #++ | 20 | #++ |
21 | 21 | ||
22 | class Query | 22 | class Query |
23 | attr_accessor :query | ||
24 | |||
25 | |||
23 | end | 26 | end |
24 | 27 | ||
25 | module Indri | 28 | module Indri |
26 | 29 | ||
27 | class Parameters | 30 | class Parameters |
28 | attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline | 31 | attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline |
29 | 32 | ||
30 | def initialize(corpus,count="1000",mem="1g",threads="1",offset="1",run_id="default",print_query=false,print_docs=false) | 33 | def initialize(corpus,count="1000",mem="1g",threads="1",offset="1",run_id="default",print_passages=false,print_query=false,print_docs=false) |
31 | @index_path = corpus | 34 | @index_path = corpus |
32 | @memory = mem | 35 | @memory = mem |
33 | @count = count | 36 | @count = count |
34 | @threads = threads | 37 | @threads = threads |
35 | @offset = offset | 38 | @offset = offset |
36 | @run_id = run_id | 39 | @run_id = run_id |
37 | @print_query = print_query ? "true" : "false" | 40 | @print_query = print_query ? "true" : "false" |
38 | @print_docs = print_docs ? "true" : "false" | 41 | @print_docs = print_docs ? "true" : "false" |
42 | @print_passages = print_passages ? "true" : "false" | ||
43 | @indexes = [corpus] | ||
39 | end | 44 | end |
40 | 45 | ||
41 | def to_s | 46 | def to_s |
42 | h = "<memory>#{@memory}</memory>\n" | 47 | h = "<memory>#{@memory}</memory>\n" |
43 | h += "<index>#{@index_path}</index>\n" | 48 | @indexes.each do |i| |
49 | h += "<index>#{i}</index>\n" | ||
50 | end | ||
44 | h += "<count>#{@count}</count>\n" | 51 | h += "<count>#{@count}</count>\n" |
45 | h += "<threads>#{@threads}</threads>\n" | 52 | h += "<threads>#{@threads}</threads>\n" |
46 | unless @baseline.nil? | 53 | unless @baseline.nil? |
47 | h += "<baseline>#{@baseline}</baseline>\n" | 54 | h += "<baseline>#{@baseline}</baseline>\n" |
48 | else | 55 | else |
49 | h += "<rule>#{@rule}</rule>\n" | 56 | h += "<rule>#{@rule}</rule>\n" |
50 | end | 57 | end |
51 | h += "<trecFormat>true</trecFormat>\n" | 58 | h += "<trecFormat>true</trecFormat>\n" |
52 | h += "<queryOffset>#{@offset}</queryOffset>\n" | 59 | h += "<queryOffset>#{@offset}</queryOffset>\n" |
53 | h += "<runID>#{@run_id}</runID>\n" | 60 | h += "<runID>#{@run_id}</runID>\n" |
61 | h += "<printPassages>#{@print_passages}</printPassages>\n" | ||
54 | h += "<printQuery>#{@print_query}</printQuery>\n" | 62 | h += "<printQuery>#{@print_query}</printQuery>\n" |
55 | h += "<printDocuments>#{@print_docs}</printDocuments>\n" | 63 | h += "<printDocuments>#{@print_docs}</printDocuments>\n" |
56 | 64 | ||
57 | h | 65 | h |
58 | end | 66 | end |
67 | |||
68 | def add_index path | ||
69 | @indexes << path | ||
70 | end | ||
59 | end | 71 | end |
60 | 72 | ||
61 | class IndriQueryOld < Query | 73 | class IndriQueryOld < Query |
62 | attr_accessor :id, :query, :rule | 74 | attr_accessor :id, :query, :rule |
63 | 75 | ||
64 | def initialize(id,query) | 76 | def initialize(id,query) |
65 | @id = id | 77 | @id = id |
66 | @query = query | 78 | @query = query |
67 | end | 79 | end |
68 | 80 | ||
69 | def to_s | 81 | def to_s |
70 | h = "<query>\n" | 82 | h = "<query>\n" |
71 | h += "<number>#{@id}</number>\n" | 83 | h += "<number>#{@id}</number>\n" |
72 | h += "<text>#{@query}</text>\n" | 84 | h += "<text>#{@query}</text>\n" |
73 | h += "</query>\n" | 85 | h += "</query>\n" |
74 | 86 | ||
75 | h | 87 | h |
76 | end | 88 | end |
77 | 89 | ||
78 | def exec params | 90 | def exec params |
79 | `IndriRunQuery -query='#{@query}' -index=#{params.index_path} -count=#{params.count} -rule=method:dirichlet,mu:2500 -trecFormat` | 91 | `IndriRunQuery -query='#{@query}' -index=#{params.index_path} -count=#{params.count} -rule=method:dirichlet,mu:2500 -trecFormat` |
80 | end | 92 | end |
81 | end | 93 | end |
82 | 94 | ||
83 | class IndriQuery < Query | 95 | class IndriQuery < Query |
84 | attr_accessor :query, :count, :sm_method, :sm_param, :sm_value, :args | 96 | attr_accessor :query, :count, :sm_method, :sm_param, :sm_value, :args |
85 | 97 | ||
86 | def initialize atts={},args=nil | 98 | def initialize atts={},args=nil |
87 | raise ArgumentError, 'Argument 1 must be a Hash' unless atts.is_a? Hash | 99 | raise ArgumentError, 'Argument 1 must be a Hash' unless atts.is_a? Hash |
88 | atts.each do |k,v| | 100 | atts.each do |k,v| |
89 | instance_variable_set("@#{k}", v) unless v.nil? | 101 | instance_variable_set("@#{k}", v) unless v.nil? |
90 | end | 102 | end |
91 | 103 | ||
92 | raise ArgumentError, 'Argument 2 must be a String' unless (args.is_a?(String) || args.nil?) | 104 | raise ArgumentError, 'Argument 2 must be a String' unless (args.is_a?(String) || args.nil?) |
93 | @args = args | 105 | @args = args |
106 | end | ||
107 | |||
108 | def clarity index_path,terms=10,documents=5 | ||
109 | `clarity -index=#{index_path} -documents=#{documents} -terms=#{terms} -smoothing=\"method:#{@sm_method},#{@sm_param}:#{@sm_value}\" -query=\"#{query}\"`.split("=").last.strip | ||
94 | end | 110 | end |
95 | end | 111 | end |
96 | 112 | ||
97 | class IndriQueries | 113 | class IndriQueries |
98 | attr_accessor :params, :queries | 114 | attr_accessor :params, :queries |
99 | 115 | ||
100 | def initialize params | 116 | def initialize params |
101 | # @queries = queries | 117 | # @queries = queries |
102 | 118 | ||
103 | @params = params | 119 | @params = params |
104 | @queries = {} | 120 | @queries = {} |
105 | # Here we set the default retrieval model as Language Modeling | 121 | # Here we set the default retrieval model as Language Modeling |
106 | # with a Dirichlet smoothing at 2500. | 122 | # with a Dirichlet smoothing at 2500. |
107 | # TODO: maybe a Rule class... | 123 | # TODO: maybe a Rule class... |
108 | @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil? | 124 | @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil? |
109 | end | 125 | end |
110 | 126 | ||
111 | def push id,query | 127 | def push id,query |
112 | @queries[id.to_i] = query | 128 | @queries[id.to_i] = query |
113 | end | 129 | end |
114 | 130 | ||
115 | def to_s | 131 | def to_s |
116 | h = "<parameters>\n" | 132 | h = "<parameters>\n" |
117 | h += @params.to_s | 133 | h += @params.to_s |
118 | h += @queries.sort { |a,b| a[0] <=> b[0] }.collect do |q| | 134 | h += @queries.sort { |a,b| a[0] <=> b[0] }.collect do |q| |
119 | "<query>\n" + | 135 | "<query>\n" + |
120 | "<number>#{q[0]}</number>\n" + | 136 | "<number>#{q[0]}</number>\n" + |
121 | "<text>#{q[1]}</text>\n" + | 137 | "<text>#{q[1]}</text>\n" + |
122 | "</query>\n" | 138 | "</query>\n" |
123 | end.join "" | 139 | end.join "" |
124 | # h += @queries.collect { |q| q.to_s }.join "" | 140 | # h += @queries.collect { |q| q.to_s }.join "" |
125 | h += "</parameters>" | 141 | h += "</parameters>" |
126 | 142 | ||
127 | h | 143 | h |
128 | end | 144 | end |
129 | end | 145 | end |
130 | 146 | ||
131 | end | 147 | end |
132 | 148 |
lib/mirimiri/result.rb
File was created | 1 | #!/usr/bin/env ruby | |
2 | |||
3 | #-- | ||
4 | # This file is a part of the mirimiri library | ||
5 | # | ||
6 | # Copyright (C) 2010-2012 Romain Deveaud <romain.deveaud@gmail.com> | ||
7 | # | ||
8 | # This program is free software: you can redistribute it and/or modify | ||
9 | # it under the terms of the GNU General Public License as published by | ||
10 | # the Free Software Foundation, either version 3 of the License, or | ||
11 | # (at your option) any later version. | ||
12 | # | ||
13 | # This program is distributed in the hope that it will be useful, | ||
14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | # GNU General Public License for more details. | ||
17 | # | ||
18 | # You should have received a copy of the GNU General Public License | ||
19 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
20 | #++ | ||
21 | |||
22 | module Mirimiri | ||
23 | |||
24 | # This class represents one line of a TREC-formatted retrieval | ||
25 | # result. Typical output of Indri or Terrier. | ||
26 | class TrecResult | ||
27 | attr_accessor :topic, :doc, :rank, :score, :run | ||
28 | |||
29 | def initialize arg | ||
30 | t = arg.split | ||
31 | @topic = t[0] | ||
32 | @doc = t[2] | ||
33 | @rank = t[3] | ||
34 | @score = t[4] | ||
35 | @run = t[5] | ||
36 | end | ||
37 | end | ||
38 | |||
39 | # This class represents the output of trec_eval, when | ||
40 | # -q option is given. | ||
41 | class TrecEval | ||
42 | attr_accessor :metric, :run, :queries | ||
43 | |||
44 | def initialize arg | ||
45 | @queries = {} | ||
46 | |||
47 | arg.each_line do |line| | ||
48 | t = line.split | ||
49 | @metric = t[0] if @metric.nil? | ||
50 | @queries[t[1]] = t[2].to_f if t[1].is_integer? | ||
51 | end | ||
52 | end | ||
53 | end | ||
54 | |||
55 | # An array of TrecResult, or a run. | ||
56 | class TrecResults < Array | ||
57 | |||
58 | def initialize args | ||
59 | super args.collect { |res| TrecResult.new res } | ||
60 | end | ||
61 | end | ||
62 | end | ||
63 |
lib/mirimiri/string.rb
1 | #!/usr/bin/env ruby | 1 | #!/usr/bin/env ruby |
2 | 2 | ||
3 | #-- | 3 | #-- |
4 | # This file is a part of the mirimiri library | 4 | # This file is a part of the mirimiri library |
5 | # | 5 | # |
6 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | 6 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> |
7 | # | 7 | # |
8 | # This program is free software: you can redistribute it and/or modify | 8 | # This program is free software: you can redistribute it and/or modify |
9 | # it under the terms of the GNU General Public License as published by | 9 | # it under the terms of the GNU General Public License as published by |
10 | # the Free Software Foundation, either version 3 of the License, or | 10 | # the Free Software Foundation, either version 3 of the License, or |
11 | # (at your option) any later version. | 11 | # (at your option) any later version. |
12 | # | 12 | # |
13 | # This program is distributed in the hope that it will be useful, | 13 | # This program is distributed in the hope that it will be useful, |
14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | 14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | # GNU General Public License for more details. | 16 | # GNU General Public License for more details. |
17 | # | 17 | # |
18 | # You should have received a copy of the GNU General Public License | 18 | # You should have received a copy of the GNU General Public License |
19 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | 19 | # along with this program. If not, see <http://www.gnu.org/licenses/>. |
20 | #++ | 20 | #++ |
21 | 21 | ||
module Mirimiri

  # These are the default stopwords provided by Lemur, followed by a few
  # web/Wikipedia boilerplate tokens and HTML entity names.
  Stoplist = [
    "a","about","above","according","across","after","afterwards","again","against",
    "albeit","all","almost","alone","along","already","also","although","always","am",
    "among","amongst","an","and","another","any","anybody","anyhow","anyone","anything",
    "anyway","anywhere","apart","are","around","as","at","av","be","became","because",
    "become","becomes","becoming","been","before","beforehand","behind","being","below",
    "beside","besides","between","beyond","both","but","by","can","cannot","canst",
    "certain","cf","choose","contrariwise","cos","could","cu","day","do","does","doesn't",
    "doing","dost","doth","double","down","dual","during","each","either","else",
    "elsewhere","enough","et","etc","even","ever","every","everybody","everyone",
    "everything","everywhere","except","excepted","excepting","exception","exclude",
    "excluding","exclusive","far","farther","farthest","few","ff","first","for",
    "formerly","forth","forward","from","front","further","furthermore","furthest","get",
    "go","had","halves","hardly","has","hast","hath","have","he","hence","henceforth",
    "her","here","hereabouts","hereafter","hereby","herein","hereto","hereupon","hers",
    "herself","him","himself","hindmost","his","hither","hitherto","how","however",
    "howsoever","i","ie","if","in","inasmuch","inc","include","included","including",
    "indeed","indoors","inside","insomuch","instead","into","inward","inwards","is",
    "it","its","itself","just","kind","kg","km","last","latter","latterly","less","lest",
    "let","like","little","ltd","many","may","maybe","me","meantime","meanwhile","might",
    "moreover","most","mostly","more","mr","mrs","ms","much","must","my","myself",
    "namely","need","neither","never","nevertheless","next","no","nobody","none",
    "nonetheless","noone","nope","nor","not","nothing","notwithstanding","now","nowadays",
    "nowhere","of","off","often","ok","on","once","one","only","onto","or","other",
    "others","otherwise","ought","our","ours","ourselves","out","outside","over","own",
    "per","perhaps","plenty","provide","quite","rather","really","round","said","sake",
    "same","sang","save","saw","see","seeing","seem","seemed","seeming","seems","seen",
    "seldom","selves","sent","several","shalt","she","should","shown","sideways","since",
    "slept","slew","slung","slunk","smote","so","some","somebody","somehow","someone",
    "something","sometime","sometimes","somewhat","somewhere","spake","spat","spoke",
    "spoken","sprang","sprung","stave","staves","still","such","supposing","than","that",
    "the","thee","their","them","themselves","then","thence","thenceforth","there",
    "thereabout","thereabouts","thereafter","thereby","therefore","therein","thereof",
    "thereon","thereto","thereupon","these","they","this","those","thou","though",
    "thrice","through","throughout","thru","thus","thy","thyself","till","to","together",
    "too","toward","towards","ugh","unable","under","underneath","unless","unlike",
    "until","up","upon","upward","upwards","us","use","used","using","very","via","vs",
    "want","was","we","week","well","were","what","whatever","whatsoever","when","whence",
    "whenever","whensoever","where","whereabouts","whereafter","whereas","whereat",
    "whereby","wherefore","wherefrom","wherein","whereinto","whereof","whereon",
    "wheresoever","whereto","whereunto","whereupon","wherever","wherewith","whether",
    "whew","which","whichever","whichsoever","while","whilst","whither","who","whoa",
    "whoever","whole","whom","whomever","whomsoever","whose","whosoever","why","will",
    "wilt","with","within","without","worse","worst","would","wow","ye","yet","year",
    "yippee","you","your","yours","yourself","yourselves",
    "edit", "new", "page", "article", "http", "www", "com", "org", "wikipedia", "en","html",
    "amp","nbsp","quot"
  ]

  # Maps the UTF-8 byte sequence of an accented letter or typographic symbol
  # to an ASCII approximation. Used by String#unaccent.
  Transmap = {
    "\xC3\x80" => "A", "\xC3\x81" => "A", "\xC3\x82" => "A", "\xC3\x83" => "A",
    "\xC3\x84" => "A", "\xC3\x85" => "A", "\xC3\x86" => "AE","\xC3\x87" => "C",
    "\xC3\x88" => "E", "\xC3\x89" => "E", "\xC3\x8A" => "E", "\xC3\x8B" => "E",
    "\xC3\x8C" => "I", "\xC3\x8D" => "I", "\xC3\x8E" => "I", "\xC3\x8F" => "I",
    "\xC3\x90" => "D", "\xC3\x91" => "N", "\xC3\x92" => "O", "\xC3\x93" => "O",
    "\xC3\x94" => "O", "\xC3\x95" => "O", "\xC3\x96" => "O", "\xC3\x98" => "O",
    "\xC3\x99" => "U", "\xC3\x9A" => "U", "\xC3\x9B" => "U", "\xC3\x9C" => "U",
    "\xC3\x9D" => "Y", "\xC3\x9E" => "P", "\xC3\x9F" => "ss",
    "\xC3\xA0" => "a", "\xC3\xA1" => "a", "\xC3\xA2" => "a", "\xC3\xA3" => "a",
    "\xC3\xA4" => "a", "\xC3\xA5" => "a", "\xC3\xA6" => "ae","\xC3\xA7" => "c",
    "\xC3\xA8" => "e", "\xC3\xA9" => "e", "\xC3\xAA" => "e", "\xC3\xAB" => "e",
    "\xC3\xAC" => "i", "\xC3\xAD" => "i", "\xC3\xAE" => "i", "\xC3\xAF" => "i",
    "\xC3\xB0" => "o", "\xC3\xB1" => "n", "\xC3\xB2" => "o", "\xC3\xB3" => "o",
    "\xC3\xB4" => "o", "\xC3\xB5" => "o", "\xC3\xB6" => "o", "\xC3\xB8" => "o",
    "\xC3\xB9" => "u", "\xC3\xBA" => "u", "\xC3\xBB" => "u", "\xC3\xBC" => "u",
    "\xC3\xBD" => "y", "\xC3\xBE" => "p", "\xC3\xBF" => "y",
    "\xC4\x80" => "A", "\xC4\x81" => "a", "\xC4\x82" => "A", "\xC4\x83" => "a",
    "\xC4\x84" => "A", "\xC4\x85" => "a", "\xC4\x86" => "C", "\xC4\x87" => "c",
    "\xC4\x88" => "C", "\xC4\x89" => "c", "\xC4\x8A" => "C", "\xC4\x8B" => "c",
    "\xC4\x8C" => "C", "\xC4\x8D" => "c", "\xC4\x8E" => "D", "\xC4\x8F" => "d",
    "\xC4\x90" => "D", "\xC4\x91" => "d", "\xC4\x92" => "E", "\xC4\x93" => "e",
    "\xC4\x94" => "E", "\xC4\x95" => "e", "\xC4\x96" => "E", "\xC4\x97" => "e",
    "\xC4\x98" => "E", "\xC4\x99" => "e", "\xC4\x9A" => "E", "\xC4\x9B" => "e",
    "\xC4\x9C" => "G", "\xC4\x9D" => "g", "\xC4\x9E" => "G", "\xC4\x9F" => "g",
    "\xC4\xA0" => "G", "\xC4\xA1" => "g", "\xC4\xA2" => "G", "\xC4\xA3" => "g",
    "\xC4\xA4" => "H", "\xC4\xA5" => "h", "\xC4\xA6" => "H", "\xC4\xA7" => "h",
    "\xC4\xA8" => "I", "\xC4\xA9" => "i", "\xC4\xAA" => "I", "\xC4\xAB" => "i",
    "\xC4\xAC" => "I", "\xC4\xAD" => "i", "\xC4\xAE" => "I", "\xC4\xAF" => "i",
    "\xC4\xB0" => "I", "\xC4\xB1" => "i", "\xC4\xB2" => "IJ","\xC4\xB3" => "ij",
    "\xC4\xB4" => "J", "\xC4\xB5" => "j", "\xC4\xB6" => "K", "\xC4\xB7" => "k",
    "\xC4\xB8" => "k", "\xC4\xB9" => "L", "\xC4\xBA" => "l", "\xC4\xBB" => "L",
    "\xC4\xBC" => "l", "\xC4\xBD" => "L", "\xC4\xBE" => "l", "\xC4\xBF" => "L",
    "\xC5\x80" => "l", "\xC5\x81" => "L", "\xC5\x82" => "l", "\xC5\x83" => "N",
    "\xC5\x84" => "n", "\xC5\x85" => "N", "\xC5\x86" => "n", "\xC5\x87" => "N",
    "\xC5\x88" => "n", "\xC5\x89" => "n", "\xC5\x8A" => "N", "\xC5\x8B" => "n",
    "\xC5\x8C" => "O", "\xC5\x8D" => "o", "\xC5\x8E" => "O", "\xC5\x8F" => "o",
    # U+0152 / U+0153 are the OE ligature, hence "OE"/"oe" (not "CE"/"ce").
    "\xC5\x90" => "O", "\xC5\x91" => "o", "\xC5\x92" => "OE","\xC5\x93" => "oe",
    "\xC5\x94" => "R", "\xC5\x95" => "r", "\xC5\x96" => "R", "\xC5\x97" => "r",
    "\xC5\x98" => "R", "\xC5\x99" => "r", "\xC5\x9A" => "S", "\xC5\x9B" => "s",
    "\xC5\x9C" => "S", "\xC5\x9D" => "s", "\xC5\x9E" => "S", "\xC5\x9F" => "s",
    "\xC5\xA0" => "S", "\xC5\xA1" => "s", "\xC5\xA2" => "T", "\xC5\xA3" => "t",
    "\xC5\xA4" => "T", "\xC5\xA5" => "t", "\xC5\xA6" => "T", "\xC5\xA7" => "t",
    "\xC5\xA8" => "U", "\xC5\xA9" => "u", "\xC5\xAA" => "U", "\xC5\xAB" => "u",
    "\xC5\xAC" => "U", "\xC5\xAD" => "u", "\xC5\xAE" => "U", "\xC5\xAF" => "u",
    "\xC5\xB0" => "U", "\xC5\xB1" => "u", "\xC5\xB2" => "U", "\xC5\xB3" => "u",
    "\xC5\xB4" => "W", "\xC5\xB5" => "w", "\xC5\xB6" => "Y", "\xC5\xB7" => "y",
    "\xC5\xB8" => "Y", "\xC5\xB9" => "Z", "\xC5\xBA" => "z", "\xC5\xBB" => "Z",
    "\xC5\xBC" => "z", "\xC5\xBD" => "Z", "\xC5\xBE" => "z", "\xC6\x8F" => "E",
    "\xC6\xA0" => "O", "\xC6\xA1" => "o", "\xC6\xAF" => "U", "\xC6\xB0" => "u",
    "\xC7\x8D" => "A", "\xC7\x8E" => "a", "\xC7\x8F" => "I",
    "\xC7\x90" => "i", "\xC7\x91" => "O", "\xC7\x92" => "o", "\xC7\x93" => "U",
    "\xC7\x94" => "u", "\xC7\x95" => "U", "\xC7\x96" => "u", "\xC7\x97" => "U",
    "\xC7\x98" => "u", "\xC7\x99" => "U", "\xC7\x9A" => "u", "\xC7\x9B" => "U",
    "\xC7\x9C" => "u",
    "\xC7\xBA" => "A", "\xC7\xBB" => "a", "\xC7\xBC" => "AE","\xC7\xBD" => "ae",
    "\xC7\xBE" => "O", "\xC7\xBF" => "o",
    "\xC9\x99" => "e",
    "\xC2\x82" => ",",        # High code comma
    "\xC2\x84" => ",,",       # High code double comma
    "\xC2\x85" => "...",      # Triple dot
    "\xC2\x88" => "^",        # High caret
    "\xC2\x91" => "\x27",     # Forward single quote
    "\xC2\x92" => "\x27",     # Reverse single quote
    "\xC2\x93" => "\x22",     # Forward double quote
    "\xC2\x94" => "\x22",     # Reverse double quote
    "\xC2\x96" => "-",        # High hyphen
    "\xC2\x97" => "--",       # Double hyphen
    "\xC2\xA6" => "|",        # Split vertical bar
    "\xC2\xAB" => "<<",       # Double less than
    "\xC2\xBB" => ">>",       # Double greater than
    "\xC2\xBC" => "1/4",      # one quarter
    "\xC2\xBD" => "1/2",      # one half
    "\xC2\xBE" => "3/4",      # three quarters
    "\xCA\xBF" => "\x27",     # c-single quote
    "\xCC\xA8" => "",         # modifier - under curve
    "\xCC\xB1" => "",         # modifier - under line
    # /\W/ => ""
  }

end
154 | 155 | ||
require 'cgi'

# Extension of the standard class String with text-processing helpers:
# accent folding, stopword and number tests, markup stripping and Indri
# query generation.
class String
  include Mirimiri

  # Returns a copy of +self+ in which every byte sequence listed in
  # Mirimiri::Transmap is replaced by its ASCII approximation.
  #
  # The substitution is done byte-wise, so the returned string is tagged
  # ASCII-8BIT. The receiver itself is neither modified nor re-tagged.
  def unaccent
    # Work on a binary copy: force_encoding on +self+ would silently change
    # the caller's string (and fail on frozen strings).
    binary = dup.force_encoding("ASCII-8BIT")

    Transmap.inject(binary) do |str, (utf8, asc)|
      # The keys may be tagged UTF-8 depending on the source encoding;
      # compare them as raw bytes to avoid encoding compatibility errors.
      str.gsub(utf8.dup.force_encoding("ASCII-8BIT"), asc)
    end
  end

  # Returns +true+ if every whitespace-separated word of +self+ belongs to
  # Mirimiri::Stoplist (case-insensitively), +false+ otherwise.
  # A string without any word yields +true+.
  def is_stopword?
    split.all? { |word| Stoplist.include?(word.downcase) }
  end

  # Returns +true+ if +self+ is made of decimal digits only (an optional
  # trailing newline is tolerated), +false+ otherwise.
  def is_integer?
    !!(self =~ /\A\d+\Z/)
  end

  # Returns +true+ if Kernel#Float accepts +self+, +false+ otherwise.
  def numeric?
    Float(self)
    true
  rescue ArgumentError, TypeError
    false
  end

  # Builds an Indri "#weight" query from +self+ with three components:
  # the individual words, exact matches (#1) and unordered windows of
  # width 8 (#uw8) over the word bigrams.
  #
  # field - optional field name; when given, every #1/#uw8 clause is
  #         suffixed with ".(field)".
  # t     - weight of the single-word component.
  # o     - weight of the exact-match component.
  # u     - weight of the unordered-window component.
  #
  # When +self+ yields no bigram, the single words are used inside the
  # #1 and #uw8 clauses instead.
  def sequential_dependence_model(field = nil, t = 0.85, o = 0.10, u = 0.05)
    d = Mirimiri::Document.new self

    suffix = field.nil? ? "" : ".(#{field})"

    units = d.ngrams(2)
    units = d.words if units.empty?

    ematch = units.collect { |ng| "#1(#{ng})#{suffix}" }
    pmatch = units.collect { |ng| "#uw8(#{ng})#{suffix}" }

    "#weight ( #{t} #combine( #{d.words.join(" ")} ) #{o} #combine ( #{ematch.join(" ")} ) #{u} #combine ( #{pmatch.join(" ")} ) )"
  end

  # Do not use.
  # TODO: revamp. find why this function is here.
  def remove_special_characters
    split.collect { |w|
      w.gsub(/\W/, ' ').split.collect { |part|
        part.gsub(/\W/, ' ').strip.sub(/\A.\z/, '')
      }.join(' ').strip.sub(/\A.\z/, '')
    }.join(' ')
  end

  # Removes all XML-like tags from +self+, in place.
  #
  #   s = "<html><body>test</body></html>"
  #   s.strip_xml_tags!
  #   s                         #=> "test"
  def strip_xml_tags!
    replace strip_with_pattern(/<\/?[^>]*>/)
  end

  # Returns a copy of +self+ without its XML-like tags.
  #
  #   s = "<html><body>test</body></html>"
  #   s.strip_xml_tags          #=> "test"
  #   s                         #=> "<html><body>test</body></html>"
  def strip_xml_tags
    dup.strip_xml_tags!
  end

  # Removes every <script type="text/javascript"> element (tags and
  # content) from +self+, in place. The +type+ attribute may be single- or
  # double-quoted.
  #
  #   s = "<script type='text/javascript'>var skin='vector';</script>test"
  #   s.strip_javascripts!
  #   s                         #=> "test"
  def strip_javascripts!
    replace strip_with_pattern(/<script type=["']text\/javascript["']>(.+?)<\/script>/m)
  end

  # Returns a copy of +self+ without its text/javascript <script> elements.
  #
  #   s = "<script type='text/javascript'>var skin='vector';</script>test"
  #   s.strip_javascripts       #=> "test"
  def strip_javascripts
    dup.strip_javascripts!
  end

  # Removes every <style type="text/css"> element (tags and content) from
  # +self+, in place. The +type+ attribute may be single- or double-quoted.
  def strip_stylesheets!
    replace strip_with_pattern(/<style type=["']text\/css["']>(.+?)<\/style>/m)
  end

  # Returns a copy of +self+ without its text/css <style> elements.
  def strip_stylesheets
    dup.strip_stylesheets!
  end

  # Removes punctuation from +self+, in place: every character that is not
  # an ASCII letter, a digit, a hyphen or whitespace is dropped.
  #
  #   s = "hello, world. how are you?!"
  #   s.strip_punctuation!
  #   s                         # => "hello world how are you"
  def strip_punctuation!
    replace strip_with_pattern(/[^a-zA-Z0-9\-\s]/)
  end

  # Returns a copy of +self+ without punctuation.
  #
  #   s = "hello, world. how are you?!"
  #   s.strip_punctuation       # => "hello world how are you"
  def strip_punctuation
    dup.strip_punctuation!
  end

  # Returns the text values inside all occurrences of a XML tag in +self+.
  #
  #   s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre"
  #   s.extract_xmltags_values 'a'     #=> ["Indianapolis", "Indiana"]
  def extract_xmltags_values(tag_name)
    # Escape the name and require a tag boundary so that 'a' does not also
    # match <abbr> and regexp metacharacters are taken literally.
    tag = Regexp.escape(tag_name.to_s)
    scan(/<#{tag}(?:\s[^>]*)?>(.+?)<\/#{tag}>/).flatten
  end

  # Deletes every match of +pattern+, decodes HTML entities, folds accents
  # with #unaccent and returns the result as UTF-8; bytes that cannot be
  # converted to UTF-8 are replaced by a space.
  def strip_with_pattern(pattern)
    cleaned = CGI::unescapeHTML(gsub(pattern, "")).unaccent
    # Options are passed as keywords: a positional Hash is rejected by
    # String#encode on Ruby 3.
    cleaned.encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => " ")
  end

  private :strip_with_pattern
end
286 | 296 | ||
module Indri
  # The raw text printed by an Indri query run in TREC format together with
  # the documents: each result header line
  # ("<topic> Q0 <doc> <rank> <score> <run>") is followed by the text of the
  # corresponding document.
  class IndriPrintedDocuments < String

    # A TREC result header line with a negative decimal score.
    RESULT_HEADER = /\d+ Q0 .+ \d+ -\d+\.\d+ .+/

    # Same line, capturing the document name and the score.
    RESULT_FIELDS = /\d+ Q0 (.+) \d+ (-\d+\.\d+) .+/

    # Returns the chunks of text found between the result header lines,
    # empty chunks removed.
    def extract_docs
      self.split(RESULT_HEADER).delete_if { |x| x.empty? }
    end

    # Returns three values: the document texts (see #extract_docs), the
    # scores as Strings, and for each header the first run of digits that
    # precedes ".xml" in the document name (a one-element Array, or nil
    # when the name has no such part).
    def extract_docs_score
      fields = self.scan(RESULT_FIELDS)
      score  = fields.collect { |_doc, value| value }
      name   = fields.collect { |doc, _value| doc.scan(/(\d+)\.xml/).first }
      return extract_docs, score, name
    end
  end
end
295 | 311 |
main.rb
# Make the bundled lib/ directory loadable before requiring the library.
lib_dir = File.expand_path("lib", File.dirname(__FILE__))
$LOAD_PATH.unshift lib_dir

require 'mirimiri'
require "benchmark"

# Fetch the text content of two Wikipedia pages using their URLs
dillinger_page = Mirimiri::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan")
pantera_page   = Mirimiri::WikipediaPage.new("http://en.wikipedia.org/wiki/Pantera")

# Compute the entropy of a word sequence, using the first page as context
p dillinger_page.entropy("dillinger escape plan")
p dillinger_page.tf("guitar")

# Compute the KL-Divergence between the two pages
p dillinger_page.kl(pantera_page)


# Mirimiri also comprises Indri-related classes

# Building an Indri query
sdm_query = "dillinger escape plan".sequential_dependence_model
query = Indri::IndriQuery.new({:query => sdm_query, :count => 10}, "-trecFormat=true -printDocuments=true")

# Initializing the index on which the query will be executed
# Must have been previously built using `IndriBuildIndex`
index = Indri::IndriIndex.new "/mnt/disk1/ClueWeb09_English_1noSpam"

# Run the query on the index and fetch the text of the documents
raw_output = index.runquery(query).force_encoding("ISO-8859-1").encode("UTF-8")
s = Indri::IndriPrintedDocuments.new(raw_output)
13 | 29 |