Commit e0e33fca06e4913aefe250d4d9458464a250052e

Authored by Romain Deveaud
1 parent aa386f5530
Exists in master

New way of querying Indri. Added entropy of n-grams. SDM (sequential dependence model) is now part of the String class.

Showing 6 changed files with 95 additions and 16 deletions Side-by-side Diff

... ... @@ -2,7 +2,9 @@
2 2  
3 3 require 'mirimiri/document'
4 4 require 'mirimiri/string'
  5 +require 'mirimiri/result'
5 6 require 'mirimiri/query'
  7 +require 'mirimiri/index'
6 8 require 'mirimiri/corpus'
7 9 require 'mirimiri/regexp'
8 10 require 'mirimiri/ttagger'
lib/mirimiri/document.rb
... ... @@ -58,7 +58,7 @@
58 58 end
59 59 end
60 60  
61   - ngrams_array.uniq
  61 + ngrams_array
62 62 end
63 63  
64 64 # Returns a Hash containing the words and their associated counts in the current Document.
65 65  
66 66  
... ... @@ -71,19 +71,39 @@
71 71 counts
72 72 end
73 73  
  74 + # Old entropy function.
  75 + # TODO: remove.
  76 + def entropy0(s)
  77 + en = 0.0
  78 +
  79 + s.split.each do |w|
  80 + p_wi = @count_words[w].to_f/@words.count.to_f
  81 + en += p_wi*Math.log2(p_wi)
  82 + end
  83 +
  84 + en *= -1
  85 + en
  86 + end
  87 +
74 88 # Computes the entropy of a given string +s+ inside the document.
75 89 #
76 90 # If the string parameter is composed of many words (i.e. tokens separated
77 91 # by whitespace(s)), it is considered as an ngram.
78 92 #
79   - # entropy("guitar") #=> 0.00432114812727959
80   - # entropy("dillinger escape plan") #=> 0.265862076325102
  93 + # entropy("guitar") #=> 0.014348983965324762
  94 + # entropy("dillinger escape plan") #=> 0.054976093116768154
81 95 def entropy(s)
82 96 en = 0.0
83   -
84   - s.split.each do |w|
85   - p_wi = @count_words[w].to_f/@words.count.to_f
86   - en += p_wi*Math.log2(p_wi)
  97 +
  98 + size = s.split.size
  99 +
  100 + if size == 1
  101 + p_wi = @count_words[s].to_f/@words.count.to_f
  102 + en += p_wi*Math.log(p_wi)
  103 + elsif size > 1
  104 + ng_size = ngrams(size)
  105 + p_wi = ng_size.count(s).to_f/ng_size.count.to_f
  106 + en += p_wi*Math.log(p_wi)
87 107 end
88 108  
89 109 en *= -1
lib/mirimiri/index.rb
... ... @@ -24,16 +24,25 @@
24 24  
25 25 module Indri
26 26  
27   - class IndriIndex
  27 + class IndriIndex < Index
28 28  
29   - def exec indriquery
  29 + def initialize path
  30 + raise ArgumentError, 'Index path does not exist' unless File.directory? path
  31 + @path = path
  32 + end
  33 +
  34 + def runquery indriquery
30 35 raise ArgumentError, 'Argument is not an IndriQuery' unless indriquery.is_a? Indri::IndriQuery
31 36  
32   - query = "IndriRunQuery -query#{indriquery.query} -index=#{@path}"
  37 + query = "IndriRunQuery -query=\"#{indriquery.query}\" -index=#{@path}"
33 38  
34 39 query += " -count=#{indriquery.count}" unless indriquery.count.nil?
35 40 query += " -rule=method:#{indriquery.sm_method},#{indriquery.sm_param}:#{indriquery.sm_value}" unless indriquery.sm_method.nil?
36 41 query += " #{indriquery.args}" unless indriquery.args.nil?
  42 +
  43 + res = `#{query}`
  44 +
  45 + res
37 46 end
38 47 end
39 48 end
lib/mirimiri/query.rb
... ... @@ -84,12 +84,12 @@
84 84 attr_accessor :query, :count, :sm_method, :sm_param, :sm_value, :args
85 85  
86 86 def initialize atts={},args=nil
87   - raise ArgumentError, 'Argument 1 must be a Hash' unless args.is_a? Hash
  87 + raise ArgumentError, 'Argument 1 must be a Hash' unless atts.is_a? Hash
88 88 atts.each do |k,v|
89 89 instance_variable_set("@#{k}", v) unless v.nil?
90 90 end
91 91  
92   - raise ArgumentError, 'Argument 2 must be a String' unless args.is_a? String
  92 + raise ArgumentError, 'Argument 2 must be a String' unless (args.is_a?(String) || args.nil?)
93 93 @args = args
94 94 end
95 95 end
96 96  
97 97  
98 98  
... ... @@ -97,20 +97,31 @@
97 97 class IndriQueries
98 98 attr_accessor :params, :queries
99 99  
100   - def initialize(params,*queries)
101   - @queries = queries
  100 + def initialize params
  101 +# @queries = queries
102 102  
103 103 @params = params
  104 + @queries = {}
104 105 # Here we set the default retrieval model as Language Modeling
105 106 # with a Dirichlet smoothing at 2500.
106 107 # TODO: maybe a Rule class...
107 108 @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil?
108 109 end
109 110  
  111 + def push id,query
  112 + @queries[id.to_i] = query
  113 + end
  114 +
110 115 def to_s
111 116 h = "<parameters>\n"
112 117 h += @params.to_s
113   - h += @queries.collect { |q| q.to_s }.join ""
  118 + h += @queries.sort { |a,b| a[0] <=> b[0] }.collect do |q|
  119 + "<query>\n" +
  120 + "<number>#{q[0]}</number>\n" +
  121 + "<text>#{q[1]}</text>\n" +
  122 + "</query>\n"
  123 + end.join ""
  124 +# h += @queries.collect { |q| q.to_s }.join ""
114 125 h += "</parameters>"
115 126  
116 127 h
lib/mirimiri/string.rb
... ... @@ -161,11 +161,35 @@
161 161 Transmap.inject(self.force_encoding("ASCII-8BIT")) { |str, (utf8, asc)| str.gsub(utf8, asc) }
162 162 end
163 163  
164   - # Returns +true+ if +self+ belongs to Rir::Stoplist, +false+ otherwise.
  164 + # Returns +true+ if +self+ belongs to Mirimiri::Stoplist, +false+ otherwise.
165 165 def is_stopword?
166 166 Stoplist.include?(self.downcase)
167 167 end
168 168  
  169 + def sequential_dependence_model t=0.85,o=0.10,u=0.05,field=nil
  170 + d = Mirimiri::Document.new self
  171 +
  172 + if field.nil?
  173 + ematch = d.ngrams(2).collect { |ng| "#1(#{ng})" }
  174 + pmatch = d.ngrams(2).collect { |ng| "#uw8(#{ng})" }
  175 + else
  176 + ematch = d.ngrams(2).collect { |ng| "#1(#{ng}).(#{field})" }
  177 + pmatch = d.ngrams(2).collect { |ng| "#uw8(#{ng}).(#{field})" }
  178 + end
  179 +
  180 + if ematch.empty?
  181 + if field.nil?
  182 + ematch = d.words.collect { |ng| "#1(#{ng})" }
  183 + pmatch = d.words.collect { |ng| "#uw8(#{ng})" }
  184 + else
  185 + ematch = d.words.collect { |ng| "#1(#{ng}).(#{field})" }
  186 + pmatch = d.words.collect { |ng| "#uw8(#{ng}).(#{field})" }
  187 + end
  188 + end
  189 +
  190 + "#weight ( #{t} #combine( #{self} ) #{o} #combine ( #{ematch.join(" ")} ) #{u} #combine ( #{pmatch.join(" ")} ) )"
  191 + end
  192 +
169 193 # Do not use.
170 194 # TODO: revamp. find why this function is here.
171 195 def remove_special_characters
... ... @@ -258,5 +282,14 @@
258 282 end
259 283  
260 284 private :strip_with_pattern
  285 +end
  286 +
  287 +module Indri
  288 + class IndriPrintedDocuments < String
  289 +
  290 + def extract_docs
  291 + self.split(/\d+ Q0 .+ \d+ -\d+.\d+ .+/).delete_if{ |x| x.empty? }
  292 + end
  293 + end
261 294 end
... ... @@ -5,4 +5,8 @@
5 5 w = Mirimiri::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan")
6 6 p w.entropy("dillinger escape plan")
7 7 p w.tf("guitar")
  8 +
  9 +query = Indri::IndriQuery.new({:query => "dillinger escape plan".sequential_dependence_model, :count => 10}, "-trecFormat=true -printDocuments=true")
  10 +index = Indri::IndriIndex.new "/mnt/disk1/ClueWeb09_English_1noSpam"
  11 +s = Indri::IndriPrintedDocuments.new(index.runquery(query).force_encoding("ISO-8859-1").encode("UTF-8"))