diff --git a/README.markdown b/README.markdown index 8aa5416..5a601e4 100644 --- a/README.markdown +++ b/README.markdown @@ -1,6 +1,12 @@ # mirimiri -Copyright (C) 2010-2011 Romain Deveaud +The various tools of this project were developed for research purposes during +my Ph.D. and heavily rely on the use of Indri (). +Setting up Ruby is not as painful as it used to be since RVM (), +visit at least these two websites before trying to use `mirimiri`. + + +Copyright (C) 2010-2013 Romain Deveaud > The Fijian monkey-faced bat (Mirimiri acrodonta), also called the Fiji > Flying Fox, is an Old World fruit bat endemic to Fiji. It was discovered diff --git a/lib/mirimiri/document.rb b/lib/mirimiri/document.rb index 8a7aa31..b738e0c 100644 --- a/lib/mirimiri/document.rb +++ b/lib/mirimiri/document.rb @@ -25,7 +25,7 @@ module Mirimiri # A Document is a bag of words and is constructed from a string. class Document - attr_reader :words, :doc_content, :count_words + attr_reader :words, :doc_content, :xcount # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html # and the \\W special escape). @@ -36,7 +36,7 @@ module Mirimiri @doc_content.split.each do |w| w.split(/\W/).each do |sw| - wo.push(sw.downcase) if sw =~ /[a-zA-Z]/ + wo.push(sw.downcase) if sw =~ /[[:alpha:]]/ end end @@ -80,7 +80,7 @@ module Mirimiri en = 0.0 s.split.each do |w| - p_wi = @count_words[w].to_f/@words.count.to_f + p_wi = @xcount[w].to_f/@words.count.to_f en += p_wi*Math.log2(p_wi) end @@ -101,7 +101,7 @@ module Mirimiri size = s.split.size if size == 1 - p_wi = @count_words[s].to_f/@words.count.to_f + p_wi = @xcount[s].to_f/@words.count.to_f en += p_wi*Math.log(p_wi) elsif size > 1 ng_size = ngrams(size) @@ -117,14 +117,28 @@ module Mirimiri # # tf("guitar") #=> 0.000380372765310004 def tf(s) - @count_words[s].to_f/@words.size.to_f + @xcount[s].to_f/@words.size.to_f end + # Computes the KL divergence between the language model of the +self+ + # and the language model of +doc+. + # + # KL is not symmetric, see http://en.wikipedia.org/wiki/Kullback-Leibler_divergence + # for more information. + # + # d1.kl(d2) #=> 0.2971808085725761 + def kl(doc) + raise ArgumentError, 'Argument is not a Mirimiri::Document' unless doc.is_a? Mirimiri::Document + + vocab = self.words & doc.words + + vocab.inject(0.0) { |res,w| res + self.tf(w)*Math.log(self.tf(w)/doc.tf(w)) } + end def initialize(content="") @doc_content = content @words = format_words - @count_words = count_words + @xcount = count_words @ngrams = {} end @@ -149,7 +163,7 @@ module Mirimiri @url = url content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("") - super Sanitize.clean(content.unaccent.toutf8.force_encoding("UTF-8"), :remove_contents => ['script']) + super Sanitize.clean(content, :remove_contents => ['script','style']) end end @@ -161,9 +175,9 @@ module Mirimiri def self.search_wikipedia_titles(name) - raise ArgumentError, "Bad encoding", name unless name.isutf8 +# raise ArgumentError, "Bad encoding", name unless name.isutf8 - res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).unaccent.toutf8).elements['api/query/search'] + res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&srlimit=20&format=xml" ).force_encoding("ISO-8859-1").encode("UTF-8")).elements['api/query/search'] res.collect { |e| e.attributes['title'] } unless res.nil? end diff --git a/lib/mirimiri/index.rb b/lib/mirimiri/index.rb index 0a89694..1caeb09 100644 --- a/lib/mirimiri/index.rb +++ b/lib/mirimiri/index.rb @@ -32,7 +32,7 @@ module Indri end def runquery indriquery - raise ArgumentError, 'Argument is not an IndriQuery' unless indriquery.is_a? Indri::IndriQuery + raise ArgumentError, 'Argument is not an Indri::IndriQuery' unless indriquery.is_a? Indri::IndriQuery query = "IndriRunQuery -query=\"#{indriquery.query}\" -index=#{@path}" diff --git a/lib/mirimiri/query.rb b/lib/mirimiri/query.rb index 4f038b0..7f9f02d 100644 --- a/lib/mirimiri/query.rb +++ b/lib/mirimiri/query.rb @@ -20,6 +20,9 @@ #++ class Query + attr_accessor :query + + end module Indri @@ -27,7 +30,7 @@ module Indri class Parameters attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline - def initialize(corpus,count="1000",mem="1g",threads="1",offset="1",run_id="default",print_query=false,print_docs=false) + def initialize(corpus,count="1000",mem="1g",threads="1",offset="1",run_id="default",print_passages=false,print_query=false,print_docs=false) @index_path = corpus @memory = mem @count = count @@ -36,11 +39,15 @@ module Indri @run_id = run_id @print_query = print_query ? "true" : "false" @print_docs = print_docs ? "true" : "false" + @print_passages = print_passages ? "true" : "false" + @indexes = [corpus] end def to_s h = "#{@memory}\n" - h += "#{@index_path}\n" + @indexes.each do |i| + h += "#{i}\n" + end h += "#{@count}\n" h += "#{@threads}\n" unless @baseline.nil? @@ -51,11 +58,16 @@ module Indri h += "true\n" h += "#{@offset}\n" h += "#{@run_id}\n" + h += "#{@print_passages}\n" h += "#{@print_query}\n" h += "#{@print_docs}\n" h end + + def add_index path + @indexes << path + end end class IndriQueryOld < Query @@ -92,6 +104,10 @@ module Indri raise ArgumentError, 'Argument 2 must be a String' unless (args.is_a?(String) || args.nil?) @args = args end + + def clarity index_path,terms=10,documents=5 + `clarity -index=#{index_path} -documents=#{documents} -terms=#{terms} -smoothing=\"method:#{@sm_method},#{@sm_param}:#{@sm_value}\" -query=\"#{query}\"`.split("=").last.strip + end end class IndriQueries diff --git a/lib/mirimiri/result.rb b/lib/mirimiri/result.rb new file mode 100644 index 0000000..71d0776 --- /dev/null +++ b/lib/mirimiri/result.rb @@ -0,0 +1,62 @@ +#!/usr/bin/env ruby + +#-- +# This file is a part of the mirimiri library +# +# Copyright (C) 2010-2012 Romain Deveaud +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +#++ + +module Mirimiri + + # This class represents one line of a TREC-formatted retrieval + # result. Typical output of Indri or Terrier. + class TrecResult + attr_accessor :topic, :doc, :rank, :score, :run + + def initialize arg + t = arg.split + @topic = t[0] + @doc = t[2] + @rank = t[3] + @score = t[4] + @run = t[5] + end + end + + # This class represents the output of trec_eval, when + # -q option is given. + class TrecEval + attr_accessor :metric, :run, :queries + + def initialize arg + @queries = {} + + arg.each_line do |line| + t = line.split + @metric = t[0] if @metric.nil? + @queries[t[1]] = t[2].to_f if t[1].is_integer? + end + end + end + + # An array of TrecResult, or a run. + class TrecResults < Array + + def initialize args + super args.collect { |res| TrecResult.new res } + end + end +end diff --git a/lib/mirimiri/string.rb b/lib/mirimiri/string.rb index b94fa28..212823d 100644 --- a/lib/mirimiri/string.rb +++ b/lib/mirimiri/string.rb @@ -67,7 +67,8 @@ module Mirimiri "whoever","whole","whom","whomever","whomsoever","whose","whosoever","why","will", "wilt","with","within","without","worse","worst","would","wow","ye","yet","year", "yippee","you","your","yours","yourself","yourselves", - "edit", "new", "page", "article", "http", "www", "com", "org", "wikipedia", "en","html" + "edit", "new", "page", "article", "http", "www", "com", "org", "wikipedia", "en","html", + "amp","nbsp","quot" ] Transmap = { @@ -158,6 +159,7 @@ class String def unaccent # force_encoding is needed with ruby1.9 +# Transmap.inject(self) { |str, (utf8, asc)| str.gsub(utf8, asc) } Transmap.inject(self.force_encoding("ASCII-8BIT")) { |str, (utf8, asc)| str.gsub(utf8, asc) } end @@ -166,7 +168,15 @@ class String self.split.all? { |e| Stoplist.include?(e.downcase) } end - def sequential_dependence_model t=0.85,o=0.10,u=0.05,field=nil + def is_integer? + !self.empty? && self =~ /\A\d+\Z/ + end + + def numeric? + Float(self) != nil rescue false + end + + def sequential_dependence_model field=nil,t=0.85,o=0.10,u=0.05 d = Mirimiri::Document.new self if field.nil? @@ -288,7 +298,13 @@ module Indri class IndriPrintedDocuments < String def extract_docs - self.split(/\d+ Q0 .+ \d+ -\d+.\d+ .+/).delete_if{ |x| x.empty? } + self.split(/\d+ Q0 .+ \d+ -\d+.\d+ .+/).delete_if{ |x| x.empty? } + end + + def extract_docs_score + score = self.scan(/\d+ Q0 .+ \d+ (-\d+.\d+) .+/).flatten + name = self.scan(/\d+ Q0 (.+) \d+ -\d+.\d+ .+/).collect { |n| n.first.scan(/(\d+).xml/).first } + return self.split(/\d+ Q0 .+ \d+ -\d+.\d+ .+/).delete_if{ |x| x.empty? },score,name end end end diff --git a/main.rb b/main.rb index 170caa3..114056a 100644 --- a/main.rb +++ b/main.rb @@ -3,10 +3,26 @@ $LOAD_PATH.unshift File.expand_path(File.join(File.dirname(__FILE__), "lib")) require 'mirimiri' require "benchmark" +# Fetch the text content of two Wikipedia pages using their URLs w = Mirimiri::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan") +u = Mirimiri::WikipediaPage.new("http://en.wikipedia.org/wiki/Pantera") + +# Compute the entropy of a word sequence, using `w` as context p w.entropy("dillinger escape plan") p w.tf("guitar") +# Compute the KL-Divergence between the two pages +p w.kl u + + +# Mirimiri also comprises Indri-related classes + +# Building an Indri query query = Indri::IndriQuery.new({:query => "dillinger escape plan".sequential_dependence_model, :count => 10}, "-trecFormat=true -printDocuments=true") + +# Initializing the index on which the query will be executed +# Must have been previously built using `IndriBuildIndex` index = Indri::IndriIndex.new "/mnt/disk1/ClueWeb09_English_1noSpam" + +# Run the query on the index and fetch the text of the documents s = Indri::IndriPrintedDocuments.new(index.runquery(query).force_encoding("ISO-8859-1").encode("UTF-8"))