From e0e33fca06e4913aefe250d4d9458464a250052e Mon Sep 17 00:00:00 2001 From: Romain Deveaud Date: Thu, 8 Mar 2012 08:50:39 +0100 Subject: [PATCH] new way of querying indri. entropy of n-grams. sdm is now part of the string class. --- lib/mirimiri.rb | 2 ++ lib/mirimiri/document.rb | 34 +++++++++++++++++++++++++++------- lib/mirimiri/index.rb | 15 ++++++++++++--- lib/mirimiri/query.rb | 21 ++++++++++++++++----- lib/mirimiri/string.rb | 35 ++++++++++++++++++++++++++++++++++- main.rb | 4 ++++ 6 files changed, 95 insertions(+), 16 deletions(-) diff --git a/lib/mirimiri.rb b/lib/mirimiri.rb index 05141df..f3d8d2d 100644 --- a/lib/mirimiri.rb +++ b/lib/mirimiri.rb @@ -2,7 +2,9 @@ require 'mirimiri/document' require 'mirimiri/string' +require 'mirimiri/result' require 'mirimiri/query' +require 'mirimiri/index' require 'mirimiri/corpus' require 'mirimiri/regexp' require 'mirimiri/ttagger' diff --git a/lib/mirimiri/document.rb b/lib/mirimiri/document.rb index 30cdc00..1e98851 100644 --- a/lib/mirimiri/document.rb +++ b/lib/mirimiri/document.rb @@ -58,7 +58,7 @@ module Mirimiri end end - ngrams_array.uniq + ngrams_array end # Returns a Hash containing the words and their associated counts in the current Document. @@ -71,19 +71,39 @@ module Mirimiri counts end + # Old entropy function. + # TODO: remove. + def entropy0(s) + en = 0.0 + + s.split.each do |w| + p_wi = @count_words[w].to_f/@words.count.to_f + en += p_wi*Math.log2(p_wi) + end + + en *= -1 + en + end + # Computes the entropy of a given string +s+ inside the document. # # If the string parameter is composed of many words (i.e. tokens separated # by whitespace(s)), it is considered as an ngram. # - # entropy("guitar") #=> 0.00432114812727959 - # entropy("dillinger escape plan") #=> 0.265862076325102 + # entropy("guitar") #=> 0.014348983965324762 + # entropy("dillinger escape plan") #=> 0.054976093116768154 def entropy(s) en = 0.0 - - s.split.each do |w| - p_wi = @count_words[w].to_f/@words.count.to_f - en += p_wi*Math.log2(p_wi) + + size = s.split.size + + if size == 1 + p_wi = @count_words[s].to_f/@words.count.to_f + en += p_wi*Math.log(p_wi) + elsif size > 1 + ng_size = ngrams(size) + p_wi = ng_size.count(s).to_f/ng_size.count.to_f + en += p_wi*Math.log(p_wi) end en *= -1 diff --git a/lib/mirimiri/index.rb b/lib/mirimiri/index.rb index 7316734..0a89694 100644 --- a/lib/mirimiri/index.rb +++ b/lib/mirimiri/index.rb @@ -24,16 +24,25 @@ end module Indri - class IndriIndex + class IndriIndex < Index - def exec indriquery + def initialize path + raise ArgumentError, 'Index path does not exist' unless File.directory? path + @path = path + end + + def runquery indriquery raise ArgumentError, 'Argument is not an IndriQuery' unless indriquery.is_a? Indri::IndriQuery - query = "IndriRunQuery -query#{indriquery.query} -index=#{@path}" + query = "IndriRunQuery -query=\"#{indriquery.query}\" -index=#{@path}" query += " -count=#{indriquery.count}" unless indriquery.count.nil? query += " -rule=method:#{indriquery.sm_method},#{indriquery.sm_param}:#{indriquery.sm_value}" unless indriquery.sm_method.nil? query += " #{indriquery.args}" unless indriquery.args.nil? + + res = `#{query}` + + res end end end diff --git a/lib/mirimiri/query.rb b/lib/mirimiri/query.rb index 1fe5cb8..4f038b0 100644 --- a/lib/mirimiri/query.rb +++ b/lib/mirimiri/query.rb @@ -84,12 +84,12 @@ module Indri attr_accessor :query, :count, :sm_method, :sm_param, :sm_value, :args def initialize atts={},args=nil - raise ArgumentError, 'Argument 1 must be a Hash' unless args.is_a? Hash + raise ArgumentError, 'Argument 1 must be a Hash' unless atts.is_a? Hash atts.each do |k,v| instance_variable_set("@#{k}", v) unless v.nil? end - raise ArgumentError, 'Argument 2 must be a String' unless args.is_a? String + raise ArgumentError, 'Argument 2 must be a String' unless (args.is_a?(String) || args.nil?) @args = args end end @@ -97,20 +97,31 @@ module Indri class IndriQueries attr_accessor :params, :queries - def initialize(params,*queries) - @queries = queries + def initialize params +# @queries = queries @params = params + @queries = {} # Here we set the default retrieval model as Language Modeling # with a Dirichlet smoothing at 2500. # TODO: maybe a Rule class... @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil? end + def push id,query + @queries[id.to_i] = query + end + def to_s h = "\n" h += @params.to_s - h += @queries.collect { |q| q.to_s }.join "" + h += @queries.sort { |a,b| a[0] <=> b[0] }.collect do |q| + "\n" + + "#{q[0]}\n" + + "#{q[1]}\n" + + "\n" + end.join "" +# h += @queries.collect { |q| q.to_s }.join "" h += "" h diff --git a/lib/mirimiri/string.rb b/lib/mirimiri/string.rb index 18704e2..754e53a 100644 --- a/lib/mirimiri/string.rb +++ b/lib/mirimiri/string.rb @@ -161,11 +161,35 @@ class String Transmap.inject(self.force_encoding("ASCII-8BIT")) { |str, (utf8, asc)| str.gsub(utf8, asc) } end - # Returns +true+ if +self+ belongs to Rir::Stoplist, +false+ otherwise. + # Returns +true+ if +self+ belongs to Mirimiri::Stoplist, +false+ otherwise. def is_stopword? Stoplist.include?(self.downcase) end + def sequential_dependence_model t=0.85,o=0.10,u=0.05,field=nil + d = Mirimiri::Document.new self + + if field.nil? + ematch = d.ngrams(2).collect { |ng| "#1(#{ng})" } + pmatch = d.ngrams(2).collect { |ng| "#uw8(#{ng})" } + else + ematch = d.ngrams(2).collect { |ng| "#1(#{ng}).(#{field})" } + pmatch = d.ngrams(2).collect { |ng| "#uw8(#{ng}).(#{field})" } + end + + if ematch.empty? + if field.nil? + ematch = d.words.collect { |ng| "#1(#{ng})" } + pmatch = d.words.collect { |ng| "#uw8(#{ng})" } + else + ematch = d.words.collect { |ng| "#1(#{ng}).(#{field})" } + pmatch = d.words.collect { |ng| "#uw8(#{ng}).(#{field})" } + end + end + + "#weight ( #{t} #combine( #{self} ) #{o} #combine ( #{ematch.join(" ")} ) #{u} #combine ( #{pmatch.join(" ")} ) )" + end + # Do not use. # TODO: rewamp. find why this function is here. def remove_special_characters @@ -259,3 +283,12 @@ class String private :strip_with_pattern end + +module Indri + class IndriPrintedDocuments < String + + def extract_docs + self.split(/\d+ Q0 .+ \d+ -\d+.\d+ .+/).delete_if{ |x| x.empty? } + end + end +end diff --git a/main.rb b/main.rb index cd47a26..c1efea4 100644 --- a/main.rb +++ b/main.rb @@ -5,3 +5,7 @@ require 'mirimiri' w = Mirimiri::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan") p w.entropy("dillinger escape plan") p w.tf("guitar") + +query = Indri::IndriQuery.new({:query => "dillinger escape plan".sequential_dependence_model, :count => 10}, "-trecFormat=true -printDocuments=true") +index = Indri::IndriIndex.new "/mnt/disk1/ClueWeb09_English_1noSpam" +s = Indri::IndriPrintedDocuments.new(index.runquery(query).force_encoding("ISO-8859-1").encode("UTF-8")) -- 1.8.2.3