Commit b0ffa2ad49638e2a223fff528de1a4ad336acb72

Authored by Romain Deveaud
1 parent b3c0213975
Exists in master

finally committing some recent changes

Showing 7 changed files with 146 additions and 16 deletions Side-by-side Diff

1 1 # mirimiri
2 2  
3   -Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
  3 +The various tools of this project were developed for research purposes during
  4 +my Ph.D. and rely heavily on Indri (<http://lemurproject.org/indri.php>).
  5 +Setting up Ruby is not as painful as it used to be, thanks to RVM (<https://rvm.io/>).
  6 +Visit at least these two websites before trying to use `mirimiri`.
  7 +
  8 +
  9 +Copyright (C) 2010-2013 Romain Deveaud <romain.deveaud@gmail.com>
4 10  
5 11 > The Fijian monkey-faced bat (Mirimiri acrodonta), also called the Fiji
6 12 > Flying Fox, is an Old World fruit bat endemic to Fiji. It was discovered
lib/mirimiri/document.rb
... ... @@ -25,7 +25,7 @@
25 25  
26 26 # A Document is a bag of words and is constructed from a string.
27 27 class Document
28   - attr_reader :words, :doc_content, :count_words
  28 + attr_reader :words, :doc_content, :xcount
29 29  
30 30 # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html
31 31 # and the \\W special escape).
... ... @@ -36,7 +36,7 @@
36 36  
37 37 @doc_content.split.each do |w|
38 38 w.split(/\W/).each do |sw|
39   - wo.push(sw.downcase) if sw =~ /[a-zA-Z]/
  39 + wo.push(sw.downcase) if sw =~ /[[:alpha:]]/
40 40 end
41 41 end
42 42  
... ... @@ -80,7 +80,7 @@
80 80 en = 0.0
81 81  
82 82 s.split.each do |w|
83   - p_wi = @count_words[w].to_f/@words.count.to_f
  83 + p_wi = @xcount[w].to_f/@words.count.to_f
84 84 en += p_wi*Math.log2(p_wi)
85 85 end
86 86  
... ... @@ -101,7 +101,7 @@
101 101 size = s.split.size
102 102  
103 103 if size == 1
104   - p_wi = @count_words[s].to_f/@words.count.to_f
  104 + p_wi = @xcount[s].to_f/@words.count.to_f
105 105 en += p_wi*Math.log(p_wi)
106 106 elsif size > 1
107 107 ng_size = ngrams(size)
108 108  
109 109  
110 110  
... ... @@ -117,14 +117,28 @@
117 117 #
118 118 # tf("guitar") #=> 0.000380372765310004
119 119 def tf(s)
120   - @count_words[s].to_f/@words.size.to_f
  120 + @xcount[s].to_f/@words.size.to_f
121 121 end
122 122  
  123 + # Computes the KL divergence between the language model of +self+
  124 + # and the language model of +doc+.
  125 + #
  126 + # KL is not symmetric, see http://en.wikipedia.org/wiki/Kullback-Leibler_divergence
  127 + # for more information.
  128 + #
  129 + # d1.kl(d2) #=> 0.2971808085725761
  130 + def kl(doc)
  131 + raise ArgumentError, 'Argument is not a Mirimiri::Document' unless doc.is_a? Mirimiri::Document
  132 +
  133 + vocab = self.words & doc.words
123 134  
  135 + vocab.inject(0.0) { |res,w| res + self.tf(w)*Math.log(self.tf(w)/doc.tf(w)) }
  136 + end
  137 +
124 138 def initialize(content="")
125 139 @doc_content = content
126 140 @words = format_words
127   - @count_words = count_words
  141 + @xcount = count_words
128 142 @ngrams = {}
129 143 end
130 144  
... ... @@ -149,7 +163,7 @@
149 163  
150 164 @url = url
151 165 content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("")
152   - super Sanitize.clean(content.unaccent.toutf8.force_encoding("UTF-8"), :remove_contents => ['script'])
  166 + super Sanitize.clean(content, :remove_contents => ['script','style'])
153 167 end
154 168 end
155 169  
156 170  
... ... @@ -161,9 +175,9 @@
161 175  
162 176  
163 177 def self.search_wikipedia_titles(name)
164   - raise ArgumentError, "Bad encoding", name unless name.isutf8
  178 +# raise ArgumentError, "Bad encoding", name unless name.isutf8
165 179  
166   - res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).unaccent.toutf8).elements['api/query/search']
  180 + res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&srlimit=20&format=xml" ).force_encoding("ISO-8859-1").encode("UTF-8")).elements['api/query/search']
167 181  
168 182 res.collect { |e| e.attributes['title'] } unless res.nil?
169 183 end
lib/mirimiri/index.rb
... ... @@ -32,7 +32,7 @@
32 32 end
33 33  
34 34 def runquery indriquery
35   - raise ArgumentError, 'Argument is not an IndriQuery' unless indriquery.is_a? Indri::IndriQuery
  35 + raise ArgumentError, 'Argument is not an Indri::IndriQuery' unless indriquery.is_a? Indri::IndriQuery
36 36  
37 37 query = "IndriRunQuery -query=\"#{indriquery.query}\" -index=#{@path}"
38 38  
lib/mirimiri/query.rb
... ... @@ -20,6 +20,9 @@
20 20 #++
21 21  
22 22 class Query
  23 + attr_accessor :query
  24 +
  25 +
23 26 end
24 27  
25 28 module Indri
... ... @@ -27,7 +30,7 @@
27 30 class Parameters
28 31 attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline
29 32  
30   - def initialize(corpus,count="1000",mem="1g",threads="1",offset="1",run_id="default",print_query=false,print_docs=false)
  33 + def initialize(corpus,count="1000",mem="1g",threads="1",offset="1",run_id="default",print_passages=false,print_query=false,print_docs=false)
31 34 @index_path = corpus
32 35 @memory = mem
33 36 @count = count
34 37  
... ... @@ -36,11 +39,15 @@
36 39 @run_id = run_id
37 40 @print_query = print_query ? "true" : "false"
38 41 @print_docs = print_docs ? "true" : "false"
  42 + @print_passages = print_passages ? "true" : "false"
  43 + @indexes = [corpus]
39 44 end
40 45  
41 46 def to_s
42 47 h = "<memory>#{@memory}</memory>\n"
43   - h += "<index>#{@index_path}</index>\n"
  48 + @indexes.each do |i|
  49 + h += "<index>#{i}</index>\n"
  50 + end
44 51 h += "<count>#{@count}</count>\n"
45 52 h += "<threads>#{@threads}</threads>\n"
46 53 unless @baseline.nil?
47 54  
... ... @@ -51,11 +58,16 @@
51 58 h += "<trecFormat>true</trecFormat>\n"
52 59 h += "<queryOffset>#{@offset}</queryOffset>\n"
53 60 h += "<runID>#{@run_id}</runID>\n"
  61 + h += "<printPassages>#{@print_passages}</printPassages>\n"
54 62 h += "<printQuery>#{@print_query}</printQuery>\n"
55 63 h += "<printDocuments>#{@print_docs}</printDocuments>\n"
56 64  
57 65 h
58 66 end
  67 +
  68 + def add_index path
  69 + @indexes << path
  70 + end
59 71 end
60 72  
61 73 class IndriQueryOld < Query
... ... @@ -91,6 +103,10 @@
91 103  
92 104 raise ArgumentError, 'Argument 2 must be a String' unless (args.is_a?(String) || args.nil?)
93 105 @args = args
  106 + end
  107 +
  108 + def clarity index_path,terms=10,documents=5
  109 + `clarity -index=#{index_path} -documents=#{documents} -terms=#{terms} -smoothing=\"method:#{@sm_method},#{@sm_param}:#{@sm_value}\" -query=\"#{query}\"`.split("=").last.strip
94 110 end
95 111 end
96 112  
lib/mirimiri/result.rb
  1 +#!/usr/bin/env ruby
  2 +
  3 +#--
  4 +# This file is a part of the mirimiri library
  5 +#
  6 +# Copyright (C) 2010-2012 Romain Deveaud <romain.deveaud@gmail.com>
  7 +#
  8 +# This program is free software: you can redistribute it and/or modify
  9 +# it under the terms of the GNU General Public License as published by
  10 +# the Free Software Foundation, either version 3 of the License, or
  11 +# (at your option) any later version.
  12 +#
  13 +# This program is distributed in the hope that it will be useful,
  14 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 +# GNU General Public License for more details.
  17 +#
  18 +# You should have received a copy of the GNU General Public License
  19 +# along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +#++
  21 +
  22 +module Mirimiri
  23 +
  24 + # This class represents one line of a TREC-formatted retrieval
  25 + # result, as typically output by Indri or Terrier.
  26 + class TrecResult
  27 + attr_accessor :topic, :doc, :rank, :score, :run
  28 +
  29 + def initialize arg
  30 + t = arg.split
  31 + @topic = t[0]
  32 + @doc = t[2]
  33 + @rank = t[3]
  34 + @score = t[4]
  35 + @run = t[5]
  36 + end
  37 + end
  38 +
  39 + # This class represents the output of trec_eval when
  40 + # the -q option is given.
  41 + class TrecEval
  42 + attr_accessor :metric, :run, :queries
  43 +
  44 + def initialize arg
  45 + @queries = {}
  46 +
  47 + arg.each_line do |line|
  48 + t = line.split
  49 + @metric = t[0] if @metric.nil?
  50 + @queries[t[1]] = t[2].to_f if t[1].is_integer?
  51 + end
  52 + end
  53 + end
  54 +
  55 + # An array of TrecResult objects, i.e. a run.
  56 + class TrecResults < Array
  57 +
  58 + def initialize args
  59 + super args.collect { |res| TrecResult.new res }
  60 + end
  61 + end
  62 +end
lib/mirimiri/string.rb
... ... @@ -67,7 +67,8 @@
67 67 "whoever","whole","whom","whomever","whomsoever","whose","whosoever","why","will",
68 68 "wilt","with","within","without","worse","worst","would","wow","ye","yet","year",
69 69 "yippee","you","your","yours","yourself","yourselves",
70   - "edit", "new", "page", "article", "http", "www", "com", "org", "wikipedia", "en","html"
  70 + "edit", "new", "page", "article", "http", "www", "com", "org", "wikipedia", "en","html",
  71 + "amp","nbsp","quot"
71 72 ]
72 73  
73 74 Transmap = {
... ... @@ -158,6 +159,7 @@
158 159  
159 160 def unaccent
160 161 # force_encoding is needed with ruby1.9
  162 +# Transmap.inject(self) { |str, (utf8, asc)| str.gsub(utf8, asc) }
161 163 Transmap.inject(self.force_encoding("ASCII-8BIT")) { |str, (utf8, asc)| str.gsub(utf8, asc) }
162 164 end
163 165  
... ... @@ -166,7 +168,15 @@
166 168 self.split.all? { |e| Stoplist.include?(e.downcase) }
167 169 end
168 170  
169   - def sequential_dependence_model t=0.85,o=0.10,u=0.05,field=nil
  171 + def is_integer?
  172 + !self.empty? && self =~ /\A\d+\Z/
  173 + end
  174 +
  175 + def numeric?
  176 + Float(self) != nil rescue false
  177 + end
  178 +
  179 + def sequential_dependence_model field=nil,t=0.85,o=0.10,u=0.05
170 180 d = Mirimiri::Document.new self
171 181  
172 182 if field.nil?
... ... @@ -288,7 +298,13 @@
288 298 class IndriPrintedDocuments < String
289 299  
290 300 def extract_docs
291   - self.split(/\d+ Q0 .+ \d+ -\d+.\d+ .+/).delete_if{ |x| x.empty? }
  301 + self.split(/\d+ Q0 .+ \d+ -\d+.\d+ .+/).delete_if{ |x| x.empty? }
  302 + end
  303 +
  304 + def extract_docs_score
  305 + score = self.scan(/\d+ Q0 .+ \d+ (-\d+.\d+) .+/).flatten
  306 + name = self.scan(/\d+ Q0 (.+) \d+ -\d+.\d+ .+/).collect { |n| n.first.scan(/(\d+).xml/).first }
  307 + return self.split(/\d+ Q0 .+ \d+ -\d+.\d+ .+/).delete_if{ |x| x.empty? },score,name
292 308 end
293 309 end
294 310 end
... ... @@ -3,11 +3,27 @@
3 3 require 'mirimiri'
4 4 require "benchmark"
5 5  
  6 +# Fetch the text content of two Wikipedia pages using their URLs
6 7 w = Mirimiri::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan")
  8 +u = Mirimiri::WikipediaPage.new("http://en.wikipedia.org/wiki/Pantera")
  9 +
  10 +# Compute the entropy of a word sequence, using `w` as context
7 11 p w.entropy("dillinger escape plan")
8 12 p w.tf("guitar")
9 13  
  14 +# Compute the KL-Divergence between the two pages
  15 +p w.kl u
  16 +
  17 +
  18 +# Mirimiri also comprises Indri-related classes
  19 +
  20 +# Building an Indri query
10 21 query = Indri::IndriQuery.new({:query => "dillinger escape plan".sequential_dependence_model, :count => 10}, "-trecFormat=true -printDocuments=true")
  22 +
  23 +# Initialize the index on which the query will be executed.
  24 +# The index must have been previously built using `IndriBuildIndex`.
11 25 index = Indri::IndriIndex.new "/mnt/disk1/ClueWeb09_English_1noSpam"
  26 +
  27 +# Run the query on the index and fetch the text of the documents
12 28 s = Indri::IndriPrintedDocuments.new(index.runquery(query).force_encoding("ISO-8859-1").encode("UTF-8"))