Commit e0e33fca06e4913aefe250d4d9458464a250052e
1 parent
aa386f5530
Exists in
master
New way of querying Indri. Entropy of n-grams. The sequential dependence model (SDM) is now part of the String class.
Showing 6 changed files with 95 additions and 16 deletions Side-by-side Diff
lib/mirimiri.rb
lib/mirimiri/document.rb
| ... | ... | @@ -58,7 +58,7 @@ |
| 58 | 58 | end |
| 59 | 59 | end |
| 60 | 60 | |
| 61 | - ngrams_array.uniq | |
| 61 | + ngrams_array | |
| 62 | 62 | end |
| 63 | 63 | |
| 64 | 64 | # Returns a Hash containing the words and their associated counts in the current Document. |
| 65 | 65 | |
| 66 | 66 | |
| ... | ... | @@ -71,19 +71,39 @@ |
| 71 | 71 | counts |
| 72 | 72 | end |
| 73 | 73 | |
| 74 | + # Old entropy function. | |
| 75 | + # TODO: remove. | |
| 76 | + def entropy0(s) | |
| 77 | + en = 0.0 | |
| 78 | + | |
| 79 | + s.split.each do |w| | |
| 80 | + p_wi = @count_words[w].to_f/@words.count.to_f | |
| 81 | + en += p_wi*Math.log2(p_wi) | |
| 82 | + end | |
| 83 | + | |
| 84 | + en *= -1 | |
| 85 | + en | |
| 86 | + end | |
| 87 | + | |
| 74 | 88 | # Computes the entropy of a given string +s+ inside the document. |
| 75 | 89 | # |
| 76 | 90 | # If the string parameter is composed of several words (i.e. tokens separated |
| 77 | 91 | # by whitespace), it is treated as an n-gram. |
| 78 | 92 | # |
| 79 | - # entropy("guitar") #=> 0.00432114812727959 | |
| 80 | - # entropy("dillinger escape plan") #=> 0.265862076325102 | |
| 93 | + # entropy("guitar") #=> 0.014348983965324762 | |
| 94 | + # entropy("dillinger escape plan") #=> 0.054976093116768154 | |
| 81 | 95 | def entropy(s) |
| 82 | 96 | en = 0.0 |
| 83 | - | |
| 84 | - s.split.each do |w| | |
| 85 | - p_wi = @count_words[w].to_f/@words.count.to_f | |
| 86 | - en += p_wi*Math.log2(p_wi) | |
| 97 | + | |
| 98 | + size = s.split.size | |
| 99 | + | |
| 100 | + if size == 1 | |
| 101 | + p_wi = @count_words[s].to_f/@words.count.to_f | |
| 102 | + en += p_wi*Math.log(p_wi) | |
| 103 | + elsif size > 1 | |
| 104 | + ng_size = ngrams(size) | |
| 105 | + p_wi = ng_size.count(s).to_f/ng_size.count.to_f | |
| 106 | + en += p_wi*Math.log(p_wi) | |
| 87 | 107 | end |
| 88 | 108 | |
| 89 | 109 | en *= -1 |
lib/mirimiri/index.rb
| ... | ... | @@ -24,16 +24,25 @@ |
| 24 | 24 | |
| 25 | 25 | module Indri |
| 26 | 26 | |
| 27 | - class IndriIndex | |
| 27 | + class IndriIndex < Index | |
| 28 | 28 | |
| 29 | - def exec indriquery | |
| 29 | + def initialize path | |
| 30 | + raise ArgumentError, 'Index path does not exist' unless File.directory? path | |
| 31 | + @path = path | |
| 32 | + end | |
| 33 | + | |
| 34 | + def runquery indriquery | |
| 30 | 35 | raise ArgumentError, 'Argument is not an IndriQuery' unless indriquery.is_a? Indri::IndriQuery |
| 31 | 36 | |
| 32 | - query = "IndriRunQuery -query#{indriquery.query} -index=#{@path}" | |
| 37 | + query = "IndriRunQuery -query=\"#{indriquery.query}\" -index=#{@path}" | |
| 33 | 38 | |
| 34 | 39 | query += " -count=#{indriquery.count}" unless indriquery.count.nil? |
| 35 | 40 | query += " -rule=method:#{indriquery.sm_method},#{indriquery.sm_param}:#{indriquery.sm_value}" unless indriquery.sm_method.nil? |
| 36 | 41 | query += " #{indriquery.args}" unless indriquery.args.nil? |
| 42 | + | |
| 43 | + res = `#{query}` | |
| 44 | + | |
| 45 | + res | |
| 37 | 46 | end |
| 38 | 47 | end |
| 39 | 48 | end |
lib/mirimiri/query.rb
| ... | ... | @@ -84,12 +84,12 @@ |
| 84 | 84 | attr_accessor :query, :count, :sm_method, :sm_param, :sm_value, :args |
| 85 | 85 | |
| 86 | 86 | def initialize atts={},args=nil |
| 87 | - raise ArgumentError, 'Argument 1 must be a Hash' unless args.is_a? Hash | |
| 87 | + raise ArgumentError, 'Argument 1 must be a Hash' unless atts.is_a? Hash | |
| 88 | 88 | atts.each do |k,v| |
| 89 | 89 | instance_variable_set("@#{k}", v) unless v.nil? |
| 90 | 90 | end |
| 91 | 91 | |
| 92 | - raise ArgumentError, 'Argument 2 must be a String' unless args.is_a? String | |
| 92 | + raise ArgumentError, 'Argument 2 must be a String' unless (args.is_a?(String) || args.nil?) | |
| 93 | 93 | @args = args |
| 94 | 94 | end |
| 95 | 95 | end |
| 96 | 96 | |
| 97 | 97 | |
| 98 | 98 | |
| ... | ... | @@ -97,20 +97,31 @@ |
| 97 | 97 | class IndriQueries |
| 98 | 98 | attr_accessor :params, :queries |
| 99 | 99 | |
| 100 | - def initialize(params,*queries) | |
| 101 | - @queries = queries | |
| 100 | + def initialize params | |
| 101 | +# @queries = queries | |
| 102 | 102 | |
| 103 | 103 | @params = params |
| 104 | + @queries = {} | |
| 104 | 105 | # Here we set the default retrieval model as Language Modeling |
| 105 | 106 | # with a Dirichlet smoothing at 2500. |
| 106 | 107 | # TODO: maybe a Rule class... |
| 107 | 108 | @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil? |
| 108 | 109 | end |
| 109 | 110 | |
| 111 | + def push id,query | |
| 112 | + @queries[id.to_i] = query | |
| 113 | + end | |
| 114 | + | |
| 110 | 115 | def to_s |
| 111 | 116 | h = "<parameters>\n" |
| 112 | 117 | h += @params.to_s |
| 113 | - h += @queries.collect { |q| q.to_s }.join "" | |
| 118 | + h += @queries.sort { |a,b| a[0] <=> b[0] }.collect do |q| | |
| 119 | + "<query>\n" + | |
| 120 | + "<number>#{q[0]}</number>\n" + | |
| 121 | + "<text>#{q[1]}</text>\n" + | |
| 122 | + "</query>\n" | |
| 123 | + end.join "" | |
| 124 | +# h += @queries.collect { |q| q.to_s }.join "" | |
| 114 | 125 | h += "</parameters>" |
| 115 | 126 | |
| 116 | 127 | h |
lib/mirimiri/string.rb
| ... | ... | @@ -161,11 +161,35 @@ |
| 161 | 161 | Transmap.inject(self.force_encoding("ASCII-8BIT")) { |str, (utf8, asc)| str.gsub(utf8, asc) } |
| 162 | 162 | end |
| 163 | 163 | |
| 164 | - # Returns +true+ if +self+ belongs to Rir::Stoplist, +false+ otherwise. | |
| 164 | + # Returns +true+ if +self+ belongs to Mirimiri::Stoplist, +false+ otherwise. | |
| 165 | 165 | def is_stopword? |
| 166 | 166 | Stoplist.include?(self.downcase) |
| 167 | 167 | end |
| 168 | 168 | |
| 169 | + def sequential_dependence_model t=0.85,o=0.10,u=0.05,field=nil | |
| 170 | + d = Mirimiri::Document.new self | |
| 171 | + | |
| 172 | + if field.nil? | |
| 173 | + ematch = d.ngrams(2).collect { |ng| "#1(#{ng})" } | |
| 174 | + pmatch = d.ngrams(2).collect { |ng| "#uw8(#{ng})" } | |
| 175 | + else | |
| 176 | + ematch = d.ngrams(2).collect { |ng| "#1(#{ng}).(#{field})" } | |
| 177 | + pmatch = d.ngrams(2).collect { |ng| "#uw8(#{ng}).(#{field})" } | |
| 178 | + end | |
| 179 | + | |
| 180 | + if ematch.empty? | |
| 181 | + if field.nil? | |
| 182 | + ematch = d.words.collect { |ng| "#1(#{ng})" } | |
| 183 | + pmatch = d.words.collect { |ng| "#uw8(#{ng})" } | |
| 184 | + else | |
| 185 | + ematch = d.words.collect { |ng| "#1(#{ng}).(#{field})" } | |
| 186 | + pmatch = d.words.collect { |ng| "#uw8(#{ng}).(#{field})" } | |
| 187 | + end | |
| 188 | + end | |
| 189 | + | |
| 190 | + "#weight ( #{t} #combine( #{self} ) #{o} #combine ( #{ematch.join(" ")} ) #{u} #combine ( #{pmatch.join(" ")} ) )" | |
| 191 | + end | |
| 192 | + | |
| 169 | 193 | # Do not use. |
| 170 | 194 | # TODO: revamp. Find out why this function is here. |
| 171 | 195 | def remove_special_characters |
| ... | ... | @@ -258,5 +282,14 @@ |
| 258 | 282 | end |
| 259 | 283 | |
| 260 | 284 | private :strip_with_pattern |
| 285 | +end | |
| 286 | + | |
| 287 | +module Indri | |
| 288 | + class IndriPrintedDocuments < String | |
| 289 | + | |
| 290 | + def extract_docs | |
| 291 | + self.split(/\d+ Q0 .+ \d+ -\d+.\d+ .+/).delete_if{ |x| x.empty? } | |
| 292 | + end | |
| 293 | + end | |
| 261 | 294 | end |
main.rb
| ... | ... | @@ -5,4 +5,8 @@ |
| 5 | 5 | w = Mirimiri::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan") |
| 6 | 6 | p w.entropy("dillinger escape plan") |
| 7 | 7 | p w.tf("guitar") |
| 8 | + | |
| 9 | +query = Indri::IndriQuery.new({:query => "dillinger escape plan".sequential_dependence_model, :count => 10}, "-trecFormat=true -printDocuments=true") | |
| 10 | +index = Indri::IndriIndex.new "/mnt/disk1/ClueWeb09_English_1noSpam" | |
| 11 | +s = Indri::IndriPrintedDocuments.new(index.runquery(query).force_encoding("ISO-8859-1").encode("UTF-8")) |