Commit b0ffa2ad49638e2a223fff528de1a4ad336acb72
1 parent
b3c0213975
Exists in
master
finally committing some recent changes
Showing 7 changed files with 146 additions and 16 deletions Side-by-side Diff
README.markdown
1 | 1 | # mirimiri |
2 | 2 | |
3 | -Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | |
3 | +The various tools of this project were developed for research purposes during | |
4 | +my Ph.D. and heavily rely on the use of Indri (<http://lemurproject.org/indri.php>). | |
5 | +Setting up Ruby is not as painful as it used to be, thanks to RVM (<https://rvm.io/>). | |
6 | +Visit at least these two websites before trying to use `mirimiri`. | |
7 | + | |
8 | + | |
9 | +Copyright (C) 2010-2013 Romain Deveaud <romain.deveaud@gmail.com> | |
4 | 10 | |
5 | 11 | > The Fijian monkey-faced bat (Mirimiri acrodonta), also called the Fiji |
6 | 12 | > Flying Fox, is an Old World fruit bat endemic to Fiji. It was discovered |
lib/mirimiri/document.rb
... | ... | @@ -25,7 +25,7 @@ |
25 | 25 | |
26 | 26 | # A Document is a bag of words and is constructed from a string. |
27 | 27 | class Document |
28 | - attr_reader :words, :doc_content, :count_words | |
28 | + attr_reader :words, :doc_content, :xcount | |
29 | 29 | |
30 | 30 | # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html |
31 | 31 | # and the \\W special escape). |
... | ... | @@ -36,7 +36,7 @@ |
36 | 36 | |
37 | 37 | @doc_content.split.each do |w| |
38 | 38 | w.split(/\W/).each do |sw| |
39 | - wo.push(sw.downcase) if sw =~ /[a-zA-Z]/ | |
39 | + wo.push(sw.downcase) if sw =~ /[[:alpha:]]/ | |
40 | 40 | end |
41 | 41 | end |
42 | 42 | |
... | ... | @@ -80,7 +80,7 @@ |
80 | 80 | en = 0.0 |
81 | 81 | |
82 | 82 | s.split.each do |w| |
83 | - p_wi = @count_words[w].to_f/@words.count.to_f | |
83 | + p_wi = @xcount[w].to_f/@words.count.to_f | |
84 | 84 | en += p_wi*Math.log2(p_wi) |
85 | 85 | end |
86 | 86 | |
... | ... | @@ -101,7 +101,7 @@ |
101 | 101 | size = s.split.size |
102 | 102 | |
103 | 103 | if size == 1 |
104 | - p_wi = @count_words[s].to_f/@words.count.to_f | |
104 | + p_wi = @xcount[s].to_f/@words.count.to_f | |
105 | 105 | en += p_wi*Math.log(p_wi) |
106 | 106 | elsif size > 1 |
107 | 107 | ng_size = ngrams(size) |
108 | 108 | |
109 | 109 | |
110 | 110 | |
... | ... | @@ -117,14 +117,28 @@ |
117 | 117 | # |
118 | 118 | # tf("guitar") #=> 0.000380372765310004 |
119 | 119 | def tf(s) |
120 | - @count_words[s].to_f/@words.size.to_f | |
120 | + @xcount[s].to_f/@words.size.to_f | |
121 | 121 | end |
122 | 122 | |
123 | + # Computes the KL divergence between the language model of +self+ | |
124 | + # and the language model of +doc+. | |
125 | + # | |
126 | + # KL is not symmetric, see http://en.wikipedia.org/wiki/Kullback-Leibler_divergence | |
127 | + # for more information. | |
128 | + # | |
129 | + # d1.kl(d2) #=> 0.2971808085725761 | |
130 | + def kl(doc) | |
131 | + raise ArgumentError, 'Argument is not a Mirimiri::Document' unless doc.is_a? Mirimiri::Document | |
132 | + | |
133 | + vocab = self.words & doc.words | |
123 | 134 | |
135 | + vocab.inject(0.0) { |res,w| res + self.tf(w)*Math.log(self.tf(w)/doc.tf(w)) } | |
136 | + end | |
137 | + | |
124 | 138 | def initialize(content="") |
125 | 139 | @doc_content = content |
126 | 140 | @words = format_words |
127 | - @count_words = count_words | |
141 | + @xcount = count_words | |
128 | 142 | @ngrams = {} |
129 | 143 | end |
130 | 144 | |
... | ... | @@ -149,7 +163,7 @@ |
149 | 163 | |
150 | 164 | @url = url |
151 | 165 | content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("") |
152 | - super Sanitize.clean(content.unaccent.toutf8.force_encoding("UTF-8"), :remove_contents => ['script']) | |
166 | + super Sanitize.clean(content, :remove_contents => ['script','style']) | |
153 | 167 | end |
154 | 168 | end |
155 | 169 | |
156 | 170 | |
... | ... | @@ -161,9 +175,9 @@ |
161 | 175 | |
162 | 176 | |
163 | 177 | def self.search_wikipedia_titles(name) |
164 | - raise ArgumentError, "Bad encoding", name unless name.isutf8 | |
178 | +# raise ArgumentError, "Bad encoding", name unless name.isutf8 | |
165 | 179 | |
166 | - res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).unaccent.toutf8).elements['api/query/search'] | |
180 | + res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&srlimit=20&format=xml" ).force_encoding("ISO-8859-1").encode("UTF-8")).elements['api/query/search'] | |
167 | 181 | |
168 | 182 | res.collect { |e| e.attributes['title'] } unless res.nil? |
169 | 183 | end |
lib/mirimiri/index.rb
... | ... | @@ -32,7 +32,7 @@ |
32 | 32 | end |
33 | 33 | |
34 | 34 | def runquery indriquery |
35 | - raise ArgumentError, 'Argument is not an IndriQuery' unless indriquery.is_a? Indri::IndriQuery | |
35 | + raise ArgumentError, 'Argument is not an Indri::IndriQuery' unless indriquery.is_a? Indri::IndriQuery | |
36 | 36 | |
37 | 37 | query = "IndriRunQuery -query=\"#{indriquery.query}\" -index=#{@path}" |
38 | 38 |
lib/mirimiri/query.rb
... | ... | @@ -20,6 +20,9 @@ |
20 | 20 | #++ |
21 | 21 | |
22 | 22 | class Query |
23 | + attr_accessor :query | |
24 | + | |
25 | + | |
23 | 26 | end |
24 | 27 | |
25 | 28 | module Indri |
... | ... | @@ -27,7 +30,7 @@ |
27 | 30 | class Parameters |
28 | 31 | attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline |
29 | 32 | |
30 | - def initialize(corpus,count="1000",mem="1g",threads="1",offset="1",run_id="default",print_query=false,print_docs=false) | |
33 | + def initialize(corpus,count="1000",mem="1g",threads="1",offset="1",run_id="default",print_passages=false,print_query=false,print_docs=false) | |
31 | 34 | @index_path = corpus |
32 | 35 | @memory = mem |
33 | 36 | @count = count |
34 | 37 | |
... | ... | @@ -36,11 +39,15 @@ |
36 | 39 | @run_id = run_id |
37 | 40 | @print_query = print_query ? "true" : "false" |
38 | 41 | @print_docs = print_docs ? "true" : "false" |
42 | + @print_passages = print_passages ? "true" : "false" | |
43 | + @indexes = [corpus] | |
39 | 44 | end |
40 | 45 | |
41 | 46 | def to_s |
42 | 47 | h = "<memory>#{@memory}</memory>\n" |
43 | - h += "<index>#{@index_path}</index>\n" | |
48 | + @indexes.each do |i| | |
49 | + h += "<index>#{i}</index>\n" | |
50 | + end | |
44 | 51 | h += "<count>#{@count}</count>\n" |
45 | 52 | h += "<threads>#{@threads}</threads>\n" |
46 | 53 | unless @baseline.nil? |
47 | 54 | |
... | ... | @@ -51,11 +58,16 @@ |
51 | 58 | h += "<trecFormat>true</trecFormat>\n" |
52 | 59 | h += "<queryOffset>#{@offset}</queryOffset>\n" |
53 | 60 | h += "<runID>#{@run_id}</runID>\n" |
61 | + h += "<printPassages>#{@print_passages}</printPassages>\n" | |
54 | 62 | h += "<printQuery>#{@print_query}</printQuery>\n" |
55 | 63 | h += "<printDocuments>#{@print_docs}</printDocuments>\n" |
56 | 64 | |
57 | 65 | h |
58 | 66 | end |
67 | + | |
68 | + def add_index path | |
69 | + @indexes << path | |
70 | + end | |
59 | 71 | end |
60 | 72 | |
61 | 73 | class IndriQueryOld < Query |
... | ... | @@ -91,6 +103,10 @@ |
91 | 103 | |
92 | 104 | raise ArgumentError, 'Argument 2 must be a String' unless (args.is_a?(String) || args.nil?) |
93 | 105 | @args = args |
106 | + end | |
107 | + | |
108 | + def clarity index_path,terms=10,documents=5 | |
109 | + `clarity -index=#{index_path} -documents=#{documents} -terms=#{terms} -smoothing=\"method:#{@sm_method},#{@sm_param}:#{@sm_value}\" -query=\"#{query}\"`.split("=").last.strip | |
94 | 110 | end |
95 | 111 | end |
96 | 112 |
lib/mirimiri/result.rb
1 | +#!/usr/bin/env ruby | |
2 | + | |
3 | +#-- | |
4 | +# This file is a part of the mirimiri library | |
5 | +# | |
6 | +# Copyright (C) 2010-2012 Romain Deveaud <romain.deveaud@gmail.com> | |
7 | +# | |
8 | +# This program is free software: you can redistribute it and/or modify | |
9 | +# it under the terms of the GNU General Public License as published by | |
10 | +# the Free Software Foundation, either version 3 of the License, or | |
11 | +# (at your option) any later version. | |
12 | +# | |
13 | +# This program is distributed in the hope that it will be useful, | |
14 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | +# GNU General Public License for more details. | |
17 | +# | |
18 | +# You should have received a copy of the GNU General Public License | |
19 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | +#++ | |
21 | + | |
22 | +module Mirimiri | |
23 | + | |
24 | + # This class represents one line of a TREC-formatted retrieval | |
25 | + # result. Typical output of Indri or Terrier. | |
26 | + class TrecResult | |
27 | + attr_accessor :topic, :doc, :rank, :score, :run | |
28 | + | |
29 | + def initialize arg | |
30 | + t = arg.split | |
31 | + @topic = t[0] | |
32 | + @doc = t[2] | |
33 | + @rank = t[3] | |
34 | + @score = t[4] | |
35 | + @run = t[5] | |
36 | + end | |
37 | + end | |
38 | + | |
39 | + # This class represents the output of trec_eval, when | |
40 | + # -q option is given. | |
41 | + class TrecEval | |
42 | + attr_accessor :metric, :run, :queries | |
43 | + | |
44 | + def initialize arg | |
45 | + @queries = {} | |
46 | + | |
47 | + arg.each_line do |line| | |
48 | + t = line.split | |
49 | + @metric = t[0] if @metric.nil? | |
50 | + @queries[t[1]] = t[2].to_f if t[1].is_integer? | |
51 | + end | |
52 | + end | |
53 | + end | |
54 | + | |
55 | + # An array of TrecResult, or a run. | |
56 | + class TrecResults < Array | |
57 | + | |
58 | + def initialize args | |
59 | + super args.collect { |res| TrecResult.new res } | |
60 | + end | |
61 | + end | |
62 | +end |
lib/mirimiri/string.rb
... | ... | @@ -67,7 +67,8 @@ |
67 | 67 | "whoever","whole","whom","whomever","whomsoever","whose","whosoever","why","will", |
68 | 68 | "wilt","with","within","without","worse","worst","would","wow","ye","yet","year", |
69 | 69 | "yippee","you","your","yours","yourself","yourselves", |
70 | - "edit", "new", "page", "article", "http", "www", "com", "org", "wikipedia", "en","html" | |
70 | + "edit", "new", "page", "article", "http", "www", "com", "org", "wikipedia", "en","html", | |
71 | + "amp","nbsp","quot" | |
71 | 72 | ] |
72 | 73 | |
73 | 74 | Transmap = { |
... | ... | @@ -158,6 +159,7 @@ |
158 | 159 | |
159 | 160 | def unaccent |
160 | 161 | # force_encoding is needed with ruby1.9 |
162 | +# Transmap.inject(self) { |str, (utf8, asc)| str.gsub(utf8, asc) } | |
161 | 163 | Transmap.inject(self.force_encoding("ASCII-8BIT")) { |str, (utf8, asc)| str.gsub(utf8, asc) } |
162 | 164 | end |
163 | 165 | |
... | ... | @@ -166,7 +168,15 @@ |
166 | 168 | self.split.all? { |e| Stoplist.include?(e.downcase) } |
167 | 169 | end |
168 | 170 | |
169 | - def sequential_dependence_model t=0.85,o=0.10,u=0.05,field=nil | |
171 | + def is_integer? | |
172 | + !self.empty? && self =~ /\A\d+\Z/ | |
173 | + end | |
174 | + | |
175 | + def numeric? | |
176 | + Float(self) != nil rescue false | |
177 | + end | |
178 | + | |
179 | + def sequential_dependence_model field=nil,t=0.85,o=0.10,u=0.05 | |
170 | 180 | d = Mirimiri::Document.new self |
171 | 181 | |
172 | 182 | if field.nil? |
... | ... | @@ -288,7 +298,13 @@ |
288 | 298 | class IndriPrintedDocuments < String |
289 | 299 | |
290 | 300 | def extract_docs |
291 | - self.split(/\d+ Q0 .+ \d+ -\d+.\d+ .+/).delete_if{ |x| x.empty? } | |
301 | + self.split(/\d+ Q0 .+ \d+ -\d+.\d+ .+/).delete_if{ |x| x.empty? } | |
302 | + end | |
303 | + | |
304 | + def extract_docs_score | |
305 | + score = self.scan(/\d+ Q0 .+ \d+ (-\d+.\d+) .+/).flatten | |
306 | + name = self.scan(/\d+ Q0 (.+) \d+ -\d+.\d+ .+/).collect { |n| n.first.scan(/(\d+).xml/).first } | |
307 | + return self.split(/\d+ Q0 .+ \d+ -\d+.\d+ .+/).delete_if{ |x| x.empty? },score,name | |
292 | 308 | end |
293 | 309 | end |
294 | 310 | end |
main.rb
... | ... | @@ -3,11 +3,27 @@ |
3 | 3 | require 'mirimiri' |
4 | 4 | require "benchmark" |
5 | 5 | |
6 | +# Fetch the text content of two Wikipedia pages using their URLs | |
6 | 7 | w = Mirimiri::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan") |
8 | +u = Mirimiri::WikipediaPage.new("http://en.wikipedia.org/wiki/Pantera") | |
9 | + | |
10 | +# Compute the entropy of a word sequence, using `w` as context | |
7 | 11 | p w.entropy("dillinger escape plan") |
8 | 12 | p w.tf("guitar") |
9 | 13 | |
14 | +# Compute the KL-Divergence between the two pages | |
15 | +p w.kl u | |
16 | + | |
17 | + | |
18 | +# Mirimiri also comprises Indri-related classes | |
19 | + | |
20 | +# Building an Indri query | |
10 | 21 | query = Indri::IndriQuery.new({:query => "dillinger escape plan".sequential_dependence_model, :count => 10}, "-trecFormat=true -printDocuments=true") |
22 | + | |
23 | +# Initializing the index on which the query will be executed | |
24 | +# Must have been previously built using `IndriBuildIndex` | |
11 | 25 | index = Indri::IndriIndex.new "/mnt/disk1/ClueWeb09_English_1noSpam" |
26 | + | |
27 | +# Run the query on the index and fetch the text of the documents | |
12 | 28 | s = Indri::IndriPrintedDocuments.new(index.runquery(query).force_encoding("ISO-8859-1").encode("UTF-8")) |