Commit b0ffa2ad49638e2a223fff528de1a4ad336acb72

Authored by Romain Deveaud
1 parent b3c0213975
Exists in master

finally committing some recent changes

Showing 7 changed files with 146 additions and 16 deletions Inline Diff

1 # mirimiri 1 # mirimiri
2 2
3 Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> 3 The various tools of this project were developed for research purposes during
4 my Ph.D. and heavily rely on the use of Indri (<http://lemurproject.org/indri.php>).
5 Setting up Ruby is not as painful as it used to be, thanks to RVM (<https://rvm.io/>).
6 Visit at least these two websites before trying to use `mirimiri`.
7
8
9 Copyright (C) 2010-2013 Romain Deveaud <romain.deveaud@gmail.com>
4 10
5 > The Fijian monkey-faced bat (Mirimiri acrodonta), also called the Fiji 11 > The Fijian monkey-faced bat (Mirimiri acrodonta), also called the Fiji
6 > Flying Fox, is an Old World fruit bat endemic to Fiji. It was discovered 12 > Flying Fox, is an Old World fruit bat endemic to Fiji. It was discovered
7 > in the hills of Taveuni by Bill Beckon in 1977 and is Fiji's only endemic 13 > in the hills of Taveuni by Bill Beckon in 1977 and is Fiji's only endemic
8 > mammal. It is listed as a critically endangered species due to habitat 14 > mammal. It is listed as a critically endangered species due to habitat
9 > loss. It has recently been transferred from Pteralopex to its own 15 > loss. It has recently been transferred from Pteralopex to its own
10 > monotypic genus Mirimiri. 16 > monotypic genus Mirimiri.
11 > 17 >
12 > #####Wikipedia 18 > #####Wikipedia
13 19
14 License 20 License
15 ======= 21 =======
16 22
17 This program is free software: you can redistribute it and/or modify 23 This program is free software: you can redistribute it and/or modify
18 it under the terms of the GNU General Public License as published by 24 it under the terms of the GNU General Public License as published by
19 the Free Software Foundation, either version 3 of the License, or 25 the Free Software Foundation, either version 3 of the License, or
20 (at your option) any later version. 26 (at your option) any later version.
21 27
22 This program is distributed in the hope that it will be useful, 28 This program is distributed in the hope that it will be useful,
23 but WITHOUT ANY WARRANTY; without even the implied warranty of 29 but WITHOUT ANY WARRANTY; without even the implied warranty of
24 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 30 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 GNU General Public License for more details. 31 GNU General Public License for more details.
26 32
27 You should have received a copy of the GNU General Public License 33 You should have received a copy of the GNU General Public License
28 along with this program. If not, see <http://www.gnu.org/licenses/>. 34 along with this program. If not, see <http://www.gnu.org/licenses/>.
29 35
lib/mirimiri/document.rb
1 #!/usr/bin/env ruby 1 #!/usr/bin/env ruby
2 2
3 #-- 3 #--
4 # This file is a part of the mirimiri library 4 # This file is a part of the mirimiri library
5 # 5 #
6 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> 6 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
7 # 7 #
8 # This program is free software: you can redistribute it and/or modify 8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by 9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or 10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version. 11 # (at your option) any later version.
12 # 12 #
13 # This program is distributed in the hope that it will be useful, 13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details. 16 # GNU General Public License for more details.
17 # 17 #
18 # You should have received a copy of the GNU General Public License 18 # You should have received a copy of the GNU General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>. 19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 #++ 20 #++
21 21
22 22
23 # General module 23 # General module
24 module Mirimiri 24 module Mirimiri
25 25
26 # A Document is a bag of words and is constructed from a string. 26 # A Document is a bag of words and is constructed from a string.
27 class Document 27 class Document
28 attr_reader :words, :doc_content, :count_words 28 attr_reader :words, :doc_content, :xcount
29 29
30 # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html 30 # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html
31 # and the \\W special escape). 31 # and the \\W special escape).
32 # 32 #
33 # Protected function, only meant to be called at initialization. 33 # Protected function, only meant to be called at initialization.
34 def format_words 34 def format_words
35 wo = [] 35 wo = []
36 36
37 @doc_content.split.each do |w| 37 @doc_content.split.each do |w|
38 w.split(/\W/).each do |sw| 38 w.split(/\W/).each do |sw|
39 wo.push(sw.downcase) if sw =~ /[a-zA-Z]/ 39 wo.push(sw.downcase) if sw =~ /[[:alpha:]]/
40 end 40 end
41 end 41 end
42 42
43 wo 43 wo
44 end 44 end
45 45
46 # Returns an Array containing the +n+-grams (words) from the current Document. 46 # Returns an Array containing the +n+-grams (words) from the current Document.
47 # 47 #
48 # ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...] 48 # ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]
49 def ngrams(n) 49 def ngrams(n)
50 window = [] 50 window = []
51 ngrams_array = [] 51 ngrams_array = []
52 52
53 if @ngrams[n].nil? 53 if @ngrams[n].nil?
54 @words.each do |w| 54 @words.each do |w|
55 window.push(w) 55 window.push(w)
56 if window.size == n 56 if window.size == n
57 ngrams_array.push window.join(" ") 57 ngrams_array.push window.join(" ")
58 window.delete_at(0) 58 window.delete_at(0)
59 end 59 end
60 end 60 end
61 @ngrams[n] = ngrams_array 61 @ngrams[n] = ngrams_array
62 end 62 end
63 63
64 @ngrams[n] 64 @ngrams[n]
65 end 65 end
66 66
67 # Returns a Hash containing the words and their associated counts in the current Document. 67 # Returns a Hash containing the words and their associated counts in the current Document.
68 # 68 #
69 # count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... } 69 # count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }
70 def count_words 70 def count_words
71 counts = Hash.new { |h,k| h[k] = 0 } 71 counts = Hash.new { |h,k| h[k] = 0 }
72 @words.each { |w| counts[w] += 1 } 72 @words.each { |w| counts[w] += 1 }
73 73
74 counts 74 counts
75 end 75 end
76 76
77 # Old entropy function. 77 # Old entropy function.
78 # TODO: remove. 78 # TODO: remove.
79 def entropy0(s) 79 def entropy0(s)
80 en = 0.0 80 en = 0.0
81 81
82 s.split.each do |w| 82 s.split.each do |w|
83 p_wi = @count_words[w].to_f/@words.count.to_f 83 p_wi = @xcount[w].to_f/@words.count.to_f
84 en += p_wi*Math.log2(p_wi) 84 en += p_wi*Math.log2(p_wi)
85 end 85 end
86 86
87 en *= -1 87 en *= -1
88 en 88 en
89 end 89 end
90 90
91 # Computes the entropy of a given string +s+ inside the document. 91 # Computes the entropy of a given string +s+ inside the document.
92 # 92 #
93 # If the string parameter is composed of many words (i.e. tokens separated 93 # If the string parameter is composed of many words (i.e. tokens separated
94 # by whitespace(s)), it is considered as an ngram. 94 # by whitespace(s)), it is considered as an ngram.
95 # 95 #
96 # entropy("guitar") #=> 0.014348983965324762 96 # entropy("guitar") #=> 0.014348983965324762
97 # entropy("dillinger escape plan") #=> 0.054976093116768154 97 # entropy("dillinger escape plan") #=> 0.054976093116768154
98 def entropy(s) 98 def entropy(s)
99 en = 0.0 99 en = 0.0
100 100
101 size = s.split.size 101 size = s.split.size
102 102
103 if size == 1 103 if size == 1
104 p_wi = @count_words[s].to_f/@words.count.to_f 104 p_wi = @xcount[s].to_f/@words.count.to_f
105 en += p_wi*Math.log(p_wi) 105 en += p_wi*Math.log(p_wi)
106 elsif size > 1 106 elsif size > 1
107 ng_size = ngrams(size) 107 ng_size = ngrams(size)
108 p_wi = ng_size.count(s).to_f/ng_size.count.to_f 108 p_wi = ng_size.count(s).to_f/ng_size.count.to_f
109 en += p_wi*Math.log(p_wi) 109 en += p_wi*Math.log(p_wi)
110 end 110 end
111 111
112 en *= -1 112 en *= -1
113 en 113 en
114 end 114 end
115 115
116 # Computes the term frequency of a given *word* +s+. 116 # Computes the term frequency of a given *word* +s+.
117 # 117 #
118 # tf("guitar") #=> 0.000380372765310004 118 # tf("guitar") #=> 0.000380372765310004
119 def tf(s) 119 def tf(s)
120 @count_words[s].to_f/@words.size.to_f 120 @xcount[s].to_f/@words.size.to_f
121 end 121 end
122 122
123 # Computes the KL divergence between the language model of +self+
124 # and the language model of +doc+.
125 #
126 # KL is not symmetric, see http://en.wikipedia.org/wiki/Kullback-Leibler_divergence
127 # for more information.
128 #
129 # d1.kl(d2) #=> 0.2971808085725761
130 def kl(doc)
131 raise ArgumentError, 'Argument is not a Mirimiri::Document' unless doc.is_a? Mirimiri::Document
132
133 vocab = self.words & doc.words
123 134
135 vocab.inject(0.0) { |res,w| res + self.tf(w)*Math.log(self.tf(w)/doc.tf(w)) }
136 end
137
124 def initialize(content="") 138 def initialize(content="")
125 @doc_content = content 139 @doc_content = content
126 @words = format_words 140 @words = format_words
127 @count_words = count_words 141 @xcount = count_words
128 @ngrams = {} 142 @ngrams = {}
129 end 143 end
130 144
131 protected :format_words, :count_words 145 protected :format_words, :count_words
132 end 146 end
133 147
134 # A WebDocument is a Document with a +url+. 148 # A WebDocument is a Document with a +url+.
135 class WebDocument < Document 149 class WebDocument < Document
136 attr_reader :url 150 attr_reader :url
137 151
138 # Returns the HTML text from the page of a given +url+. 152 # Returns the HTML text from the page of a given +url+.
139 def self.get_content(url) 153 def self.get_content(url)
140 require 'net/http' 154 require 'net/http'
141 Net::HTTP.get(URI.parse(url)) 155 Net::HTTP.get(URI.parse(url))
142 end 156 end
143 157
144 158
145 # WebDocument constructor, the content of the Document is the HTML page 159 # WebDocument constructor, the content of the Document is the HTML page
146 # without the tags. 160 # without the tags.
147 def initialize(url,only_tags=nil) 161 def initialize(url,only_tags=nil)
148 require 'sanitize' 162 require 'sanitize'
149 163
150 @url = url 164 @url = url
151 content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("") 165 content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("")
152 super Sanitize.clean(content.unaccent.toutf8.force_encoding("UTF-8"), :remove_contents => ['script']) 166 super Sanitize.clean(content, :remove_contents => ['script','style'])
153 end 167 end
154 end 168 end
155 169
156 # A WikipediaPage is a WebDocument. 170 # A WikipediaPage is a WebDocument.
157 class WikipediaPage < WebDocument 171 class WikipediaPage < WebDocument
158 require 'rexml/document' 172 require 'rexml/document'
159 require 'net/http' 173 require 'net/http'
160 require 'kconv' 174 require 'kconv'
161 175
162 176
163 def self.search_wikipedia_titles(name) 177 def self.search_wikipedia_titles(name)
164 raise ArgumentError, "Bad encoding", name unless name.isutf8 178 # raise ArgumentError, "Bad encoding", name unless name.isutf8
165 179
166 res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).unaccent.toutf8).elements['api/query/search'] 180 res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&srlimit=20&format=xml" ).force_encoding("ISO-8859-1").encode("UTF-8")).elements['api/query/search']
167 181
168 res.collect { |e| e.attributes['title'] } unless res.nil? 182 res.collect { |e| e.attributes['title'] } unless res.nil?
169 end 183 end
170 184
171 def self.get_url(name) 185 def self.get_url(name)
172 raise ArgumentError, "Bad encoding", name unless name.isutf8 186 raise ArgumentError, "Bad encoding", name unless name.isutf8
173 187
174 atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).unaccent.toutf8).elements['api/query/pages/page'].attributes 188 atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).unaccent.toutf8).elements['api/query/pages/page'].attributes
175 189
176 atts['fullurl'] if atts['missing'].nil? 190 atts['fullurl'] if atts['missing'].nil?
177 end 191 end
178 192
179 def self.search_homepage(name) 193 def self.search_homepage(name)
180 title = WikipediaPage.search_wikipedia_titles name 194 title = WikipediaPage.search_wikipedia_titles name
181 195
182 WikipediaPage.get_url(title[0]) unless title.nil? || title.empty? 196 WikipediaPage.get_url(title[0]) unless title.nil? || title.empty?
183 end 197 end
184 198
185 def self.extract_anchors(url) 199 def self.extract_anchors(url)
186 self.get_content(url).extract_xmltags_values('p').join(' ').scan(/<a href="(.+?)" title=.*?>(.+?)<\/a>/).delete_if { |a| a[0] =~ /^\/wiki\/.*$/.negated } 200 self.get_content(url).extract_xmltags_values('p').join(' ').scan(/<a href="(.+?)" title=.*?>(.+?)<\/a>/).delete_if { |a| a[0] =~ /^\/wiki\/.*$/.negated }
187 end 201 end
188 end 202 end
189 203
190 class FreebasePage < WebDocument 204 class FreebasePage < WebDocument
191 require 'net/http' 205 require 'net/http'
192 require 'kconv' 206 require 'kconv'
193 require 'json' 207 require 'json'
194 208
195 def self.search_article_ids query,limit 209 def self.search_article_ids query,limit
196 raise ArgumentError, "Bad encoding", name unless name.isutf8 210 raise ArgumentError, "Bad encoding", name unless name.isutf8
197 211
198 JSON.parse(Net::HTTP.get( URI.parse "http://api.freebase.com/api/service/search?query=#{query.gsub(" ","+")}&limit=#{limit}" ))['result'].collect { |a| a['article']['id'] unless a['article'].nil? }.compact 212 JSON.parse(Net::HTTP.get( URI.parse "http://api.freebase.com/api/service/search?query=#{query.gsub(" ","+")}&limit=#{limit}" ))['result'].collect { |a| a['article']['id'] unless a['article'].nil? }.compact
199 end 213 end
200 214
201 def self.get_url id 215 def self.get_url id
202 "http://api.freebase.com/api/trans/raw#{id}" 216 "http://api.freebase.com/api/trans/raw#{id}"
203 end 217 end
204 end 218 end
205 end 219 end
206 220
lib/mirimiri/index.rb
1 #!/usr/bin/env ruby 1 #!/usr/bin/env ruby
2 2
3 #-- 3 #--
4 # This file is a part of the mirimiri library 4 # This file is a part of the mirimiri library
5 # 5 #
6 # Copyright (C) 2010-2012 Romain Deveaud <romain.deveaud@gmail.com> 6 # Copyright (C) 2010-2012 Romain Deveaud <romain.deveaud@gmail.com>
7 # 7 #
8 # This program is free software: you can redistribute it and/or modify 8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by 9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or 10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version. 11 # (at your option) any later version.
12 # 12 #
13 # This program is distributed in the hope that it will be useful, 13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details. 16 # GNU General Public License for more details.
17 # 17 #
18 # You should have received a copy of the GNU General Public License 18 # You should have received a copy of the GNU General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>. 19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 #++ 20 #++
21 21
22 class Index 22 class Index
23 end 23 end
24 24
25 module Indri 25 module Indri
26 26
27 class IndriIndex < Index 27 class IndriIndex < Index
28 28
29 def initialize path 29 def initialize path
30 raise ArgumentError, 'Index path does not exist' unless File.directory? path 30 raise ArgumentError, 'Index path does not exist' unless File.directory? path
31 @path = path 31 @path = path
32 end 32 end
33 33
34 def runquery indriquery 34 def runquery indriquery
35 raise ArgumentError, 'Argument is not an IndriQuery' unless indriquery.is_a? Indri::IndriQuery 35 raise ArgumentError, 'Argument is not an Indri::IndriQuery' unless indriquery.is_a? Indri::IndriQuery
36 36
37 query = "IndriRunQuery -query=\"#{indriquery.query}\" -index=#{@path}" 37 query = "IndriRunQuery -query=\"#{indriquery.query}\" -index=#{@path}"
38 38
39 query += " -count=#{indriquery.count}" unless indriquery.count.nil? 39 query += " -count=#{indriquery.count}" unless indriquery.count.nil?
40 query += " -rule=method:#{indriquery.sm_method},#{indriquery.sm_param}:#{indriquery.sm_value}" unless indriquery.sm_method.nil? 40 query += " -rule=method:#{indriquery.sm_method},#{indriquery.sm_param}:#{indriquery.sm_value}" unless indriquery.sm_method.nil?
41 query += " #{indriquery.args}" unless indriquery.args.nil? 41 query += " #{indriquery.args}" unless indriquery.args.nil?
42 42
43 res = `#{query}` 43 res = `#{query}`
44 44
45 res 45 res
46 end 46 end
47 end 47 end
48 end 48 end
49 49
lib/mirimiri/query.rb
1 #!/usr/bin/env ruby 1 #!/usr/bin/env ruby
2 2
3 #-- 3 #--
4 # This file is a part of the mirimiri library 4 # This file is a part of the mirimiri library
5 # 5 #
6 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> 6 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
7 # 7 #
8 # This program is free software: you can redistribute it and/or modify 8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by 9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or 10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version. 11 # (at your option) any later version.
12 # 12 #
13 # This program is distributed in the hope that it will be useful, 13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details. 16 # GNU General Public License for more details.
17 # 17 #
18 # You should have received a copy of the GNU General Public License 18 # You should have received a copy of the GNU General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>. 19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 #++ 20 #++
21 21
22 class Query 22 class Query
23 attr_accessor :query
24
25
23 end 26 end
24 27
25 module Indri 28 module Indri
26 29
27 class Parameters 30 class Parameters
28 attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline 31 attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline
29 32
30 def initialize(corpus,count="1000",mem="1g",threads="1",offset="1",run_id="default",print_query=false,print_docs=false) 33 def initialize(corpus,count="1000",mem="1g",threads="1",offset="1",run_id="default",print_passages=false,print_query=false,print_docs=false)
31 @index_path = corpus 34 @index_path = corpus
32 @memory = mem 35 @memory = mem
33 @count = count 36 @count = count
34 @threads = threads 37 @threads = threads
35 @offset = offset 38 @offset = offset
36 @run_id = run_id 39 @run_id = run_id
37 @print_query = print_query ? "true" : "false" 40 @print_query = print_query ? "true" : "false"
38 @print_docs = print_docs ? "true" : "false" 41 @print_docs = print_docs ? "true" : "false"
42 @print_passages = print_passages ? "true" : "false"
43 @indexes = [corpus]
39 end 44 end
40 45
41 def to_s 46 def to_s
42 h = "<memory>#{@memory}</memory>\n" 47 h = "<memory>#{@memory}</memory>\n"
43 h += "<index>#{@index_path}</index>\n" 48 @indexes.each do |i|
49 h += "<index>#{i}</index>\n"
50 end
44 h += "<count>#{@count}</count>\n" 51 h += "<count>#{@count}</count>\n"
45 h += "<threads>#{@threads}</threads>\n" 52 h += "<threads>#{@threads}</threads>\n"
46 unless @baseline.nil? 53 unless @baseline.nil?
47 h += "<baseline>#{@baseline}</baseline>\n" 54 h += "<baseline>#{@baseline}</baseline>\n"
48 else 55 else
49 h += "<rule>#{@rule}</rule>\n" 56 h += "<rule>#{@rule}</rule>\n"
50 end 57 end
51 h += "<trecFormat>true</trecFormat>\n" 58 h += "<trecFormat>true</trecFormat>\n"
52 h += "<queryOffset>#{@offset}</queryOffset>\n" 59 h += "<queryOffset>#{@offset}</queryOffset>\n"
53 h += "<runID>#{@run_id}</runID>\n" 60 h += "<runID>#{@run_id}</runID>\n"
61 h += "<printPassages>#{@print_passages}</printPassages>\n"
54 h += "<printQuery>#{@print_query}</printQuery>\n" 62 h += "<printQuery>#{@print_query}</printQuery>\n"
55 h += "<printDocuments>#{@print_docs}</printDocuments>\n" 63 h += "<printDocuments>#{@print_docs}</printDocuments>\n"
56 64
57 h 65 h
58 end 66 end
67
68 def add_index path
69 @indexes << path
70 end
59 end 71 end
60 72
61 class IndriQueryOld < Query 73 class IndriQueryOld < Query
62 attr_accessor :id, :query, :rule 74 attr_accessor :id, :query, :rule
63 75
64 def initialize(id,query) 76 def initialize(id,query)
65 @id = id 77 @id = id
66 @query = query 78 @query = query
67 end 79 end
68 80
69 def to_s 81 def to_s
70 h = "<query>\n" 82 h = "<query>\n"
71 h += "<number>#{@id}</number>\n" 83 h += "<number>#{@id}</number>\n"
72 h += "<text>#{@query}</text>\n" 84 h += "<text>#{@query}</text>\n"
73 h += "</query>\n" 85 h += "</query>\n"
74 86
75 h 87 h
76 end 88 end
77 89
78 def exec params 90 def exec params
79 `IndriRunQuery -query='#{@query}' -index=#{params.index_path} -count=#{params.count} -rule=method:dirichlet,mu:2500 -trecFormat` 91 `IndriRunQuery -query='#{@query}' -index=#{params.index_path} -count=#{params.count} -rule=method:dirichlet,mu:2500 -trecFormat`
80 end 92 end
81 end 93 end
82 94
83 class IndriQuery < Query 95 class IndriQuery < Query
84 attr_accessor :query, :count, :sm_method, :sm_param, :sm_value, :args 96 attr_accessor :query, :count, :sm_method, :sm_param, :sm_value, :args
85 97
86 def initialize atts={},args=nil 98 def initialize atts={},args=nil
87 raise ArgumentError, 'Argument 1 must be a Hash' unless atts.is_a? Hash 99 raise ArgumentError, 'Argument 1 must be a Hash' unless atts.is_a? Hash
88 atts.each do |k,v| 100 atts.each do |k,v|
89 instance_variable_set("@#{k}", v) unless v.nil? 101 instance_variable_set("@#{k}", v) unless v.nil?
90 end 102 end
91 103
92 raise ArgumentError, 'Argument 2 must be a String' unless (args.is_a?(String) || args.nil?) 104 raise ArgumentError, 'Argument 2 must be a String' unless (args.is_a?(String) || args.nil?)
93 @args = args 105 @args = args
106 end
107
108 def clarity index_path,terms=10,documents=5
109 `clarity -index=#{index_path} -documents=#{documents} -terms=#{terms} -smoothing=\"method:#{@sm_method},#{@sm_param}:#{@sm_value}\" -query=\"#{query}\"`.split("=").last.strip
94 end 110 end
95 end 111 end
96 112
97 class IndriQueries 113 class IndriQueries
98 attr_accessor :params, :queries 114 attr_accessor :params, :queries
99 115
100 def initialize params 116 def initialize params
101 # @queries = queries 117 # @queries = queries
102 118
103 @params = params 119 @params = params
104 @queries = {} 120 @queries = {}
105 # Here we set the default retrieval model as Language Modeling 121 # Here we set the default retrieval model as Language Modeling
106 # with a Dirichlet smoothing at 2500. 122 # with a Dirichlet smoothing at 2500.
107 # TODO: maybe a Rule class... 123 # TODO: maybe a Rule class...
108 @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil? 124 @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil?
109 end 125 end
110 126
111 def push id,query 127 def push id,query
112 @queries[id.to_i] = query 128 @queries[id.to_i] = query
113 end 129 end
114 130
115 def to_s 131 def to_s
116 h = "<parameters>\n" 132 h = "<parameters>\n"
117 h += @params.to_s 133 h += @params.to_s
118 h += @queries.sort { |a,b| a[0] <=> b[0] }.collect do |q| 134 h += @queries.sort { |a,b| a[0] <=> b[0] }.collect do |q|
119 "<query>\n" + 135 "<query>\n" +
120 "<number>#{q[0]}</number>\n" + 136 "<number>#{q[0]}</number>\n" +
121 "<text>#{q[1]}</text>\n" + 137 "<text>#{q[1]}</text>\n" +
122 "</query>\n" 138 "</query>\n"
123 end.join "" 139 end.join ""
124 # h += @queries.collect { |q| q.to_s }.join "" 140 # h += @queries.collect { |q| q.to_s }.join ""
125 h += "</parameters>" 141 h += "</parameters>"
126 142
127 h 143 h
128 end 144 end
129 end 145 end
130 146
131 end 147 end
132 148
lib/mirimiri/result.rb
File was created 1 #!/usr/bin/env ruby
2
3 #--
4 # This file is a part of the mirimiri library
5 #
6 # Copyright (C) 2010-2012 Romain Deveaud <romain.deveaud@gmail.com>
7 #
8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the GNU General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 #++
21
22 module Mirimiri
23
24 # This class represents one line of a TREC-formatted retrieval
25 # result. Typical output of Indri or Terrier.
26 class TrecResult
27 attr_accessor :topic, :doc, :rank, :score, :run
28
29 def initialize arg
30 t = arg.split
31 @topic = t[0]
32 @doc = t[2]
33 @rank = t[3]
34 @score = t[4]
35 @run = t[5]
36 end
37 end
38
39 # This class represents the output of trec_eval, when
40 # -q option is given.
41 class TrecEval
42 attr_accessor :metric, :run, :queries
43
44 def initialize arg
45 @queries = {}
46
47 arg.each_line do |line|
48 t = line.split
49 @metric = t[0] if @metric.nil?
50 @queries[t[1]] = t[2].to_f if t[1].is_integer?
51 end
52 end
53 end
54
55 # An array of TrecResult, or a run.
56 class TrecResults < Array
57
58 def initialize args
59 super args.collect { |res| TrecResult.new res }
60 end
61 end
62 end
63
lib/mirimiri/string.rb
1 #!/usr/bin/env ruby 1 #!/usr/bin/env ruby
2 2
3 #-- 3 #--
4 # This file is a part of the mirimiri library 4 # This file is a part of the mirimiri library
5 # 5 #
6 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> 6 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
7 # 7 #
8 # This program is free software: you can redistribute it and/or modify 8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by 9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or 10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version. 11 # (at your option) any later version.
12 # 12 #
13 # This program is distributed in the hope that it will be useful, 13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details. 16 # GNU General Public License for more details.
17 # 17 #
18 # You should have received a copy of the GNU General Public License 18 # You should have received a copy of the GNU General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>. 19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 #++ 20 #++
21 21
22 module Mirimiri 22 module Mirimiri
23 23
24 # These are the default stopwords provided by Lemur. 24 # These are the default stopwords provided by Lemur.
25 Stoplist = [ 25 Stoplist = [
26 "a","about","above","according","across","after","afterwards","again","against", 26 "a","about","above","according","across","after","afterwards","again","against",
27 "albeit","all","almost","alone","along","already","also","although","always","am", 27 "albeit","all","almost","alone","along","already","also","although","always","am",
28 "among","amongst","an","and","another","any","anybody","anyhow","anyone","anything", 28 "among","amongst","an","and","another","any","anybody","anyhow","anyone","anything",
29 "anyway","anywhere","apart","are","around","as","at","av","be","became","because", 29 "anyway","anywhere","apart","are","around","as","at","av","be","became","because",
30 "become","becomes","becoming","been","before","beforehand","behind","being","below", 30 "become","becomes","becoming","been","before","beforehand","behind","being","below",
31 "beside","besides","between","beyond","both","but","by","can","cannot","canst", 31 "beside","besides","between","beyond","both","but","by","can","cannot","canst",
32 "certain","cf","choose","contrariwise","cos","could","cu","day","do","does","doesn't", 32 "certain","cf","choose","contrariwise","cos","could","cu","day","do","does","doesn't",
33 "doing","dost","doth","double","down","dual","during","each","either","else", 33 "doing","dost","doth","double","down","dual","during","each","either","else",
34 "elsewhere","enough","et","etc","even","ever","every","everybody","everyone", 34 "elsewhere","enough","et","etc","even","ever","every","everybody","everyone",
35 "everything","everywhere","except","excepted","excepting","exception","exclude", 35 "everything","everywhere","except","excepted","excepting","exception","exclude",
36 "excluding","exclusive","far","farther","farthest","few","ff","first","for", 36 "excluding","exclusive","far","farther","farthest","few","ff","first","for",
37 "formerly","forth","forward","from","front","further","furthermore","furthest","get", 37 "formerly","forth","forward","from","front","further","furthermore","furthest","get",
38 "go","had","halves","hardly","has","hast","hath","have","he","hence","henceforth", 38 "go","had","halves","hardly","has","hast","hath","have","he","hence","henceforth",
39 "her","here","hereabouts","hereafter","hereby","herein","hereto","hereupon","hers", 39 "her","here","hereabouts","hereafter","hereby","herein","hereto","hereupon","hers",
40 "herself","him","himself","hindmost","his","hither","hitherto","how","however", 40 "herself","him","himself","hindmost","his","hither","hitherto","how","however",
41 "howsoever","i","ie","if","in","inasmuch","inc","include","included","including", 41 "howsoever","i","ie","if","in","inasmuch","inc","include","included","including",
42 "indeed","indoors","inside","insomuch","instead","into","inward","inwards","is", 42 "indeed","indoors","inside","insomuch","instead","into","inward","inwards","is",
43 "it","its","itself","just","kind","kg","km","last","latter","latterly","less","lest", 43 "it","its","itself","just","kind","kg","km","last","latter","latterly","less","lest",
44 "let","like","little","ltd","many","may","maybe","me","meantime","meanwhile","might", 44 "let","like","little","ltd","many","may","maybe","me","meantime","meanwhile","might",
45 "moreover","most","mostly","more","mr","mrs","ms","much","must","my","myself", 45 "moreover","most","mostly","more","mr","mrs","ms","much","must","my","myself",
46 "namely","need","neither","never","nevertheless","next","no","nobody","none", 46 "namely","need","neither","never","nevertheless","next","no","nobody","none",
47 "nonetheless","noone","nope","nor","not","nothing","notwithstanding","now","nowadays", 47 "nonetheless","noone","nope","nor","not","nothing","notwithstanding","now","nowadays",
48 "nowhere","of","off","often","ok","on","once","one","only","onto","or","other", 48 "nowhere","of","off","often","ok","on","once","one","only","onto","or","other",
49 "others","otherwise","ought","our","ours","ourselves","out","outside","over","own", 49 "others","otherwise","ought","our","ours","ourselves","out","outside","over","own",
50 "per","perhaps","plenty","provide","quite","rather","really","round","said","sake", 50 "per","perhaps","plenty","provide","quite","rather","really","round","said","sake",
51 "same","sang","save","saw","see","seeing","seem","seemed","seeming","seems","seen", 51 "same","sang","save","saw","see","seeing","seem","seemed","seeming","seems","seen",
52 "seldom","selves","sent","several","shalt","she","should","shown","sideways","since", 52 "seldom","selves","sent","several","shalt","she","should","shown","sideways","since",
53 "slept","slew","slung","slunk","smote","so","some","somebody","somehow","someone", 53 "slept","slew","slung","slunk","smote","so","some","somebody","somehow","someone",
54 "something","sometime","sometimes","somewhat","somewhere","spake","spat","spoke", 54 "something","sometime","sometimes","somewhat","somewhere","spake","spat","spoke",
55 "spoken","sprang","sprung","stave","staves","still","such","supposing","than","that", 55 "spoken","sprang","sprung","stave","staves","still","such","supposing","than","that",
56 "the","thee","their","them","themselves","then","thence","thenceforth","there", 56 "the","thee","their","them","themselves","then","thence","thenceforth","there",
57 "thereabout","thereabouts","thereafter","thereby","therefore","therein","thereof", 57 "thereabout","thereabouts","thereafter","thereby","therefore","therein","thereof",
58 "thereon","thereto","thereupon","these","they","this","those","thou","though", 58 "thereon","thereto","thereupon","these","they","this","those","thou","though",
59 "thrice","through","throughout","thru","thus","thy","thyself","till","to","together", 59 "thrice","through","throughout","thru","thus","thy","thyself","till","to","together",
60 "too","toward","towards","ugh","unable","under","underneath","unless","unlike", 60 "too","toward","towards","ugh","unable","under","underneath","unless","unlike",
61 "until","up","upon","upward","upwards","us","use","used","using","very","via","vs", 61 "until","up","upon","upward","upwards","us","use","used","using","very","via","vs",
62 "want","was","we","week","well","were","what","whatever","whatsoever","when","whence", 62 "want","was","we","week","well","were","what","whatever","whatsoever","when","whence",
63 "whenever","whensoever","where","whereabouts","whereafter","whereas","whereat", 63 "whenever","whensoever","where","whereabouts","whereafter","whereas","whereat",
64 "whereby","wherefore","wherefrom","wherein","whereinto","whereof","whereon", 64 "whereby","wherefore","wherefrom","wherein","whereinto","whereof","whereon",
65 "wheresoever","whereto","whereunto","whereupon","wherever","wherewith","whether", 65 "wheresoever","whereto","whereunto","whereupon","wherever","wherewith","whether",
66 "whew","which","whichever","whichsoever","while","whilst","whither","who","whoa", 66 "whew","which","whichever","whichsoever","while","whilst","whither","who","whoa",
67 "whoever","whole","whom","whomever","whomsoever","whose","whosoever","why","will", 67 "whoever","whole","whom","whomever","whomsoever","whose","whosoever","why","will",
68 "wilt","with","within","without","worse","worst","would","wow","ye","yet","year", 68 "wilt","with","within","without","worse","worst","would","wow","ye","yet","year",
69 "yippee","you","your","yours","yourself","yourselves", 69 "yippee","you","your","yours","yourself","yourselves",
70 "edit", "new", "page", "article", "http", "www", "com", "org", "wikipedia", "en","html" 70 "edit", "new", "page", "article", "http", "www", "com", "org", "wikipedia", "en","html",
71 "amp","nbsp","quot"
71 ] 72 ]
72 73
# Transliteration table used by String#unaccent.
#
# Keys are the UTF-8 byte sequences of accented Latin letters, ligatures and
# a few typographic symbols; values are their plain ASCII replacements.
# The "\xC2\x8x"/"\xC2\x9x" entries are C1 control code points, which is
# what Windows-1252 punctuation turns into when it is mis-decoded as Latin-1.
Transmap = {
  "\xC3\x80" => "A", "\xC3\x81" => "A", "\xC3\x82" => "A", "\xC3\x83" => "A",
  "\xC3\x84" => "A", "\xC3\x85" => "A", "\xC3\x86" => "AE","\xC3\x87" => "C",
  "\xC3\x88" => "E", "\xC3\x89" => "E", "\xC3\x8A" => "E", "\xC3\x8B" => "E",
  "\xC3\x8C" => "I", "\xC3\x8D" => "I", "\xC3\x8E" => "I", "\xC3\x8F" => "I",
  "\xC3\x90" => "D", "\xC3\x91" => "N", "\xC3\x92" => "O", "\xC3\x93" => "O",
  "\xC3\x94" => "O", "\xC3\x95" => "O", "\xC3\x96" => "O", "\xC3\x98" => "O",
  "\xC3\x99" => "U", "\xC3\x9A" => "U", "\xC3\x9B" => "U", "\xC3\x9C" => "U",
  "\xC3\x9D" => "Y", "\xC3\x9E" => "P", "\xC3\x9F" => "ss",
  "\xC3\xA0" => "a", "\xC3\xA1" => "a", "\xC3\xA2" => "a", "\xC3\xA3" => "a",
  "\xC3\xA4" => "a", "\xC3\xA5" => "a", "\xC3\xA6" => "ae","\xC3\xA7" => "c",
  "\xC3\xA8" => "e", "\xC3\xA9" => "e", "\xC3\xAA" => "e", "\xC3\xAB" => "e",
  "\xC3\xAC" => "i", "\xC3\xAD" => "i", "\xC3\xAE" => "i", "\xC3\xAF" => "i",
  # U+00F0 (eth) is the lower-case form of U+00D0, which maps to "D" above.
  "\xC3\xB0" => "d", "\xC3\xB1" => "n", "\xC3\xB2" => "o", "\xC3\xB3" => "o",
  "\xC3\xB4" => "o", "\xC3\xB5" => "o", "\xC3\xB6" => "o", "\xC3\xB8" => "o",
  "\xC3\xB9" => "u", "\xC3\xBA" => "u", "\xC3\xBB" => "u", "\xC3\xBC" => "u",
  "\xC3\xBD" => "y", "\xC3\xBE" => "p", "\xC3\xBF" => "y",
  "\xC4\x80" => "A", "\xC4\x81" => "a", "\xC4\x82" => "A", "\xC4\x83" => "a",
  "\xC4\x84" => "A", "\xC4\x85" => "a", "\xC4\x86" => "C", "\xC4\x87" => "c",
  "\xC4\x88" => "C", "\xC4\x89" => "c", "\xC4\x8A" => "C", "\xC4\x8B" => "c",
  "\xC4\x8C" => "C", "\xC4\x8D" => "c", "\xC4\x8E" => "D", "\xC4\x8F" => "d",
  "\xC4\x90" => "D", "\xC4\x91" => "d", "\xC4\x92" => "E", "\xC4\x93" => "e",
  "\xC4\x94" => "E", "\xC4\x95" => "e", "\xC4\x96" => "E", "\xC4\x97" => "e",
  "\xC4\x98" => "E", "\xC4\x99" => "e", "\xC4\x9A" => "E", "\xC4\x9B" => "e",
  "\xC4\x9C" => "G", "\xC4\x9D" => "g", "\xC4\x9E" => "G", "\xC4\x9F" => "g",
  "\xC4\xA0" => "G", "\xC4\xA1" => "g", "\xC4\xA2" => "G", "\xC4\xA3" => "g",
  "\xC4\xA4" => "H", "\xC4\xA5" => "h", "\xC4\xA6" => "H", "\xC4\xA7" => "h",
  "\xC4\xA8" => "I", "\xC4\xA9" => "i", "\xC4\xAA" => "I", "\xC4\xAB" => "i",
  "\xC4\xAC" => "I", "\xC4\xAD" => "i", "\xC4\xAE" => "I", "\xC4\xAF" => "i",
  "\xC4\xB0" => "I", "\xC4\xB1" => "i", "\xC4\xB2" => "IJ","\xC4\xB3" => "ij",
  "\xC4\xB4" => "J", "\xC4\xB5" => "j", "\xC4\xB6" => "K", "\xC4\xB7" => "k",
  "\xC4\xB8" => "k", "\xC4\xB9" => "L", "\xC4\xBA" => "l", "\xC4\xBB" => "L",
  "\xC4\xBC" => "l", "\xC4\xBD" => "L", "\xC4\xBE" => "l", "\xC4\xBF" => "L",
  "\xC5\x80" => "l", "\xC5\x81" => "L", "\xC5\x82" => "l", "\xC5\x83" => "N",
  "\xC5\x84" => "n", "\xC5\x85" => "N", "\xC5\x86" => "n", "\xC5\x87" => "N",
  "\xC5\x88" => "n", "\xC5\x89" => "n", "\xC5\x8A" => "N", "\xC5\x8B" => "n",
  "\xC5\x8C" => "O", "\xC5\x8D" => "o", "\xC5\x8E" => "O", "\xC5\x8F" => "o",
  # U+0152 / U+0153 are the OE ligatures, hence "OE"/"oe".
  "\xC5\x90" => "O", "\xC5\x91" => "o", "\xC5\x92" => "OE","\xC5\x93" => "oe",
  "\xC5\x94" => "R", "\xC5\x95" => "r", "\xC5\x96" => "R", "\xC5\x97" => "r",
  "\xC5\x98" => "R", "\xC5\x99" => "r", "\xC5\x9A" => "S", "\xC5\x9B" => "s",
  "\xC5\x9C" => "S", "\xC5\x9D" => "s", "\xC5\x9E" => "S", "\xC5\x9F" => "s",
  "\xC5\xA0" => "S", "\xC5\xA1" => "s", "\xC5\xA2" => "T", "\xC5\xA3" => "t",
  "\xC5\xA4" => "T", "\xC5\xA5" => "t", "\xC5\xA6" => "T", "\xC5\xA7" => "t",
  "\xC5\xA8" => "U", "\xC5\xA9" => "u", "\xC5\xAA" => "U", "\xC5\xAB" => "u",
  "\xC5\xAC" => "U", "\xC5\xAD" => "u", "\xC5\xAE" => "U", "\xC5\xAF" => "u",
  "\xC5\xB0" => "U", "\xC5\xB1" => "u", "\xC5\xB2" => "U", "\xC5\xB3" => "u",
  "\xC5\xB4" => "W", "\xC5\xB5" => "w", "\xC5\xB6" => "Y", "\xC5\xB7" => "y",
  "\xC5\xB8" => "Y", "\xC5\xB9" => "Z", "\xC5\xBA" => "z", "\xC5\xBB" => "Z",
  "\xC5\xBC" => "z", "\xC5\xBD" => "Z", "\xC5\xBE" => "z", "\xC6\x8F" => "E",
  "\xC6\xA0" => "O", "\xC6\xA1" => "o", "\xC6\xAF" => "U", "\xC6\xB0" => "u",
  "\xC7\x8D" => "A", "\xC7\x8E" => "a", "\xC7\x8F" => "I",
  "\xC7\x90" => "i", "\xC7\x91" => "O", "\xC7\x92" => "o", "\xC7\x93" => "U",
  "\xC7\x94" => "u", "\xC7\x95" => "U", "\xC7\x96" => "u", "\xC7\x97" => "U",
  "\xC7\x98" => "u", "\xC7\x99" => "U", "\xC7\x9A" => "u", "\xC7\x9B" => "U",
  "\xC7\x9C" => "u",
  "\xC7\xBA" => "A", "\xC7\xBB" => "a", "\xC7\xBC" => "AE","\xC7\xBD" => "ae",
  "\xC7\xBE" => "O", "\xC7\xBF" => "o",
  "\xC9\x99" => "e",
  "\xC2\x82" => ",",        # High code comma
  "\xC2\x84" => ",,",       # High code double comma
  "\xC2\x85" => "...",      # Triple dot
  "\xC2\x88" => "^",        # High caret
  "\xC2\x91" => "\x27",     # Forward single quote
  "\xC2\x92" => "\x27",     # Reverse single quote
  "\xC2\x93" => "\x22",     # Forward double quote
  "\xC2\x94" => "\x22",     # Reverse double quote
  "\xC2\x96" => "-",        # High hyphen
  "\xC2\x97" => "--",       # Double hyphen
  "\xC2\xA6" => "|",        # Split vertical bar
  "\xC2\xAB" => "<<",       # Double less than
  "\xC2\xBB" => ">>",       # Double greater than
  "\xC2\xBC" => "1/4",      # one quarter
  "\xC2\xBD" => "1/2",      # one half
  "\xC2\xBE" => "3/4",      # three quarters
  "\xCA\xBF" => "\x27",     # c-single quote
  "\xCC\xA8" => "",         # modifier - under curve
  "\xCC\xB1" => "",         # modifier - under line
  # /\W/ => ""
}
152 153
153 end 154 end
154 155
# Extension of the standard class String with useful functions.
#
# This reopens the core class, so every method below is available on all
# strings. Mirimiri is included so that its constants (Transmap, Stoplist)
# can be referenced directly.
class String
  include Mirimiri

  # Replaces the byte sequences listed in Mirimiri::Transmap (accented
  # letters, ligatures, some typographic symbols) with their ASCII
  # equivalents.
  #
  # Returns a new ASCII-8BIT (binary) string. The receiver is left untouched:
  # the work is done on a copy so that its encoding does not change.
  def unaccent
    # force_encoding is needed with ruby1.9: both the subject and the patterns
    # are handled as raw bytes, otherwise gsub refuses to mix a binary string
    # holding high bytes with UTF-8 patterns.
    Transmap.inject(dup.force_encoding("ASCII-8BIT")) do |str, (utf8, asc)|
      str.gsub(utf8.dup.force_encoding("ASCII-8BIT"), asc)
    end
  end

  # Returns +true+ if every whitespace-separated word of +self+ belongs to
  # Mirimiri::Stoplist (case-insensitively), +false+ otherwise.
  #
  # An empty or blank string has no words and therefore yields +true+.
  def is_stopword?
    split.all? { |e| Stoplist.include?(e.downcase) }
  end

  # Returns +true+ if +self+ consists only of one or more digits.
  #
  #   "123".is_integer?   #=> true
  #   "12a".is_integer?   #=> false
  #   "12\n".is_integer?  #=> false
  def is_integer?
    # \z (not \Z) so that a trailing newline is not silently accepted;
    # \d+ already rejects the empty string.
    !!(self =~ /\A\d+\z/)
  end

  # Returns +true+ if Kernel#Float can parse +self+, +false+ otherwise.
  def numeric?
    Float(self)
    true
  rescue StandardError
    false
  end

  # Builds an Indri sequential dependence model query from +self+.
  #
  # field:: optional field name; when given, the ordered and unordered
  #         window operators are suffixed with ".(field)".
  # t, o, u:: weights of the unigram, ordered-window (#1) and
  #           unordered-window (#uw8) parts of the query.
  #
  # The windows are built on the 2-grams of the text; when there is none,
  # the single words are used instead.
  def sequential_dependence_model(field = nil, t = 0.85, o = 0.10, u = 0.05)
    d = Mirimiri::Document.new self
    suffix = field.nil? ? "" : ".(#{field})"

    units  = d.ngrams(2)
    ematch = units.collect { |ng| "#1(#{ng})#{suffix}" }

    if ematch.empty?
      units  = d.words
      ematch = units.collect { |ng| "#1(#{ng})#{suffix}" }
    end

    pmatch = units.collect { |ng| "#uw8(#{ng})#{suffix}" }

    "#weight ( #{t} #combine( #{d.words.join(" ")} ) #{o} #combine ( #{ematch.join(" ")} ) #{u} #combine ( #{pmatch.join(" ")} ) )"
  end

  # Do not use.
  # TODO: revamp. find why this function is here.
  #
  # For each whitespace-separated word: non-word characters become spaces,
  # single-character pieces are blanked, and a word reduced to a single
  # character is blanked as well. Blanked words still take part in the final
  # join, so the result may contain extra spaces.
  def remove_special_characters
    cleaned = split.collect do |word|
      pieces = word.gsub(/\W/, ' ').split.collect { |piece| piece.sub(/\A.\z/, '') }
      pieces.join(' ').strip.sub(/\A.\z/, '')
    end

    cleaned.join(' ')
  end

  # Removes all XML-like tags from +self+.
  #
  #   s = "<html><body>test</body></html>"
  #   s.strip_xml_tags!
  #   s         #=> "test"
  def strip_xml_tags!
    replace strip_with_pattern(/<\/?[^>]*>/)
  end

  # Removes all XML-like tags from +self+.
  #
  #   s = "<html><body>test</body></html>"
  #   s.strip_xml_tags    #=> "test"
  #   s         #=> "<html><body>test</body></html>"
  def strip_xml_tags
    dup.strip_xml_tags!
  end

  # Removes all <script> elements (tags and content) from +self+, whatever
  # their attributes or quoting style. Text around them is kept as is.
  #
  #   s = "<script type='text/javascript'>var skin='vector';</script>test"
  #   s.strip_javascripts!
  #   s         #=> "test"
  def strip_javascripts!
    replace strip_with_pattern(/<script\b[^>]*>.*?<\/script\s*>/mi)
  end

  # Removes all <script> elements (tags and content) from a copy of +self+.
  #
  #   s = "<script type='text/javascript'>var skin='vector';</script>test"
  #   s.strip_javascripts   #=> "test"
  def strip_javascripts
    dup.strip_javascripts!
  end

  # Removes all <style> elements (tags and content) from +self+, whatever
  # their attributes or quoting style.
  #
  #   s = "<style type='text/css'>p { color: red }</style>test"
  #   s.strip_stylesheets!
  #   s         #=> "test"
  def strip_stylesheets!
    replace strip_with_pattern(/<style\b[^>]*>.*?<\/style\s*>/mi)
  end

  # Removes all <style> elements (tags and content) from a copy of +self+.
  def strip_stylesheets
    dup.strip_stylesheets!
  end

  # Removes punctuation from +self+: every character that is not an ASCII
  # letter, a digit, a hyphen or whitespace is deleted. Accented letters are
  # deleted too, since the pattern is applied before any transliteration.
  #
  #   s = "hello, world. how are you?!"
  #   s.strip_punctuation!
  #   s         # => "hello world how are you"
  def strip_punctuation!
    replace strip_with_pattern(/[^a-zA-Z0-9\-\s]/)
  end

  # Removes punctuation from a copy of +self+.
  #
  #   s = "hello, world. how are you?!"
  #   s.strip_punctuation   # => "hello world how are you"
  def strip_punctuation
    dup.strip_punctuation!
  end

  # Returns the text values inside all occurrences of a XML tag in +self+.
  # +tag_name+ is interpolated into the pattern as-is; values must sit on a
  # single line to be matched.
  #
  #   s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre"
  #   s.extract_xmltags_values 'a'    #=> ["Indianapolis", "Indiana"]
  def extract_xmltags_values(tag_name)
    # \b keeps 'a' from also opening on tags such as <abbr> or <article>.
    scan(/<#{tag_name}\b.*?>(.+?)<\/#{tag_name}>/).flatten
  end

  # Deletes every match of +pattern+, decodes HTML entities, transliterates
  # with #unaccent and returns the result as UTF-8. Bytes that are still
  # non-ASCII after transliteration are replaced by a space.
  def strip_with_pattern(pattern)
    require 'cgi'

    unescaped = CGI.unescapeHTML(gsub(pattern, ""))
    # Options are given as trailing keywords (no braces) so that they are
    # read as options on every Ruby version rather than as a source encoding.
    unescaped.unaccent.encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => " ")
  end

  private :strip_with_pattern
end
286 296
module Indri
  # Raw output of a query run with "-trecFormat=true -printDocuments=true":
  # TREC result lines ("<qid> Q0 <docno> <rank> <score> <run>") interleaved
  # with the text of the corresponding documents.
  class IndriPrintedDocuments < String

    # A TREC result line, used as separator between document texts.
    # Scores are expected to be negative decimals (e.g. -4.56).
    RESULT_LINE = /\d+ Q0 .+ \d+ -\d+\.\d+ .+/

    # Same line, capturing the document name and the score.
    RESULT_FIELDS = /\d+ Q0 (.+) \d+ (-\d+\.\d+) .+/

    # Returns the document texts, i.e. the non-empty chunks found between
    # the TREC result lines.
    def extract_docs
      split(RESULT_LINE).delete_if { |x| x.empty? }
    end

    # Returns three values: the document texts (see #extract_docs), the
    # scores as strings, and for each result the first "<digits>.xml" number
    # of its document name, as a one-element array (+nil+ when the name
    # holds no such number).
    def extract_docs_score
      fields = scan(RESULT_FIELDS)
      score  = fields.collect { |_docno, value| value }
      name   = fields.collect { |docno, _value| docno.scan(/(\d+)\.xml/).first }
      return extract_docs, score, name
    end
  end
end
295 311
1 $LOAD_PATH.unshift File.expand_path(File.join(File.dirname(__FILE__), "lib")) 1 $LOAD_PATH.unshift File.expand_path(File.join(File.dirname(__FILE__), "lib"))
2 2
3 require 'mirimiri' 3 require 'mirimiri'
4 require "benchmark" 4 require "benchmark"
5 5
6 # Fetch the text content of two Wikipedia pages using their URLs
6 w = Mirimiri::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan") 7 w = Mirimiri::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan")
8 u = Mirimiri::WikipediaPage.new("http://en.wikipedia.org/wiki/Pantera")
9
10 # Compute the entropy of a word sequence, using `w` as context
7 p w.entropy("dillinger escape plan") 11 p w.entropy("dillinger escape plan")
8 p w.tf("guitar") 12 p w.tf("guitar")
9 13
14 # Compute the KL-Divergence between the two pages
15 p w.kl u
16
17
18 # Mirimiri also comprises Indri-related classes
19
20 # Building an Indri query
10 query = Indri::IndriQuery.new({:query => "dillinger escape plan".sequential_dependence_model, :count => 10}, "-trecFormat=true -printDocuments=true") 21 query = Indri::IndriQuery.new({:query => "dillinger escape plan".sequential_dependence_model, :count => 10}, "-trecFormat=true -printDocuments=true")
22
23 # Initializing the index on which the query will be executed
24 # Must have been previously built using `IndriBuildIndex`
11 index = Indri::IndriIndex.new "/mnt/disk1/ClueWeb09_English_1noSpam" 25 index = Indri::IndriIndex.new "/mnt/disk1/ClueWeb09_English_1noSpam"
26
27 # Run the query on the index and fetch the text of the documents
12 s = Indri::IndriPrintedDocuments.new(index.runquery(query).force_encoding("ISO-8859-1").encode("UTF-8")) 28 s = Indri::IndriPrintedDocuments.new(index.runquery(query).force_encoding("ISO-8859-1").encode("UTF-8"))
13 29