Commit e0e33fca06e4913aefe250d4d9458464a250052e
1 parent
aa386f5530
Exists in
master
new way of querying indri. entropy of n-grams. sdm is now part of the string class.
Showing 6 changed files with 95 additions and 16 deletions Side-by-side Diff
lib/mirimiri.rb
lib/mirimiri/document.rb
... | ... | @@ -58,7 +58,7 @@ |
58 | 58 | end |
59 | 59 | end |
60 | 60 | |
61 | - ngrams_array.uniq | |
61 | + ngrams_array | |
62 | 62 | end |
63 | 63 | |
64 | 64 | # Returns a Hash containing the words and their associated counts in the current Document. |
65 | 65 | |
66 | 66 | |
... | ... | @@ -71,19 +71,39 @@ |
71 | 71 | counts |
72 | 72 | end |
73 | 73 | |
74 | + # Old entropy function. | |
75 | + # TODO: remove. | |
76 | + def entropy0(s) | |
77 | + en = 0.0 | |
78 | + | |
79 | + s.split.each do |w| | |
80 | + p_wi = @count_words[w].to_f/@words.count.to_f | |
81 | + en += p_wi*Math.log2(p_wi) | |
82 | + end | |
83 | + | |
84 | + en *= -1 | |
85 | + en | |
86 | + end | |
87 | + | |
74 | 88 | # Computes the entropy of a given string +s+ inside the document. |
75 | 89 | # |
76 | 90 | # If the string parameter is composed of many words (i.e. tokens separated |
77 | 91 | # by whitespace(s)), it is considered as an ngram. |
78 | 92 | # |
79 | - # entropy("guitar") #=> 0.00432114812727959 | |
80 | - # entropy("dillinger escape plan") #=> 0.265862076325102 | |
93 | + # entropy("guitar") #=> 0.014348983965324762 | |
94 | + # entropy("dillinger escape plan") #=> 0.054976093116768154 | |
81 | 95 | def entropy(s) |
82 | 96 | en = 0.0 |
83 | - | |
84 | - s.split.each do |w| | |
85 | - p_wi = @count_words[w].to_f/@words.count.to_f | |
86 | - en += p_wi*Math.log2(p_wi) | |
97 | + | |
98 | + size = s.split.size | |
99 | + | |
100 | + if size == 1 | |
101 | + p_wi = @count_words[s].to_f/@words.count.to_f | |
102 | + en += p_wi*Math.log(p_wi) | |
103 | + elsif size > 1 | |
104 | + ng_size = ngrams(size) | |
105 | + p_wi = ng_size.count(s).to_f/ng_size.count.to_f | |
106 | + en += p_wi*Math.log(p_wi) | |
87 | 107 | end |
88 | 108 | |
89 | 109 | en *= -1 |
lib/mirimiri/index.rb
... | ... | @@ -24,16 +24,25 @@ |
24 | 24 | |
25 | 25 | module Indri |
26 | 26 | |
27 | - class IndriIndex | |
27 | + class IndriIndex < Index | |
28 | 28 | |
29 | - def exec indriquery | |
29 | + def initialize path | |
30 | + raise ArgumentError, 'Index path does not exist' unless File.directory? path | |
31 | + @path = path | |
32 | + end | |
33 | + | |
34 | + def runquery indriquery | |
30 | 35 | raise ArgumentError, 'Argument is not an IndriQuery' unless indriquery.is_a? Indri::IndriQuery |
31 | 36 | |
32 | - query = "IndriRunQuery -query#{indriquery.query} -index=#{@path}" | |
37 | + query = "IndriRunQuery -query=\"#{indriquery.query}\" -index=#{@path}" | |
33 | 38 | |
34 | 39 | query += " -count=#{indriquery.count}" unless indriquery.count.nil? |
35 | 40 | query += " -rule=method:#{indriquery.sm_method},#{indriquery.sm_param}:#{indriquery.sm_value}" unless indriquery.sm_method.nil? |
36 | 41 | query += " #{indriquery.args}" unless indriquery.args.nil? |
42 | + | |
43 | + res = `#{query}` | |
44 | + | |
45 | + res | |
37 | 46 | end |
38 | 47 | end |
39 | 48 | end |
lib/mirimiri/query.rb
... | ... | @@ -84,12 +84,12 @@ |
84 | 84 | attr_accessor :query, :count, :sm_method, :sm_param, :sm_value, :args |
85 | 85 | |
86 | 86 | def initialize atts={},args=nil |
87 | - raise ArgumentError, 'Argument 1 must be a Hash' unless args.is_a? Hash | |
87 | + raise ArgumentError, 'Argument 1 must be a Hash' unless atts.is_a? Hash | |
88 | 88 | atts.each do |k,v| |
89 | 89 | instance_variable_set("@#{k}", v) unless v.nil? |
90 | 90 | end |
91 | 91 | |
92 | - raise ArgumentError, 'Argument 2 must be a String' unless args.is_a? String | |
92 | + raise ArgumentError, 'Argument 2 must be a String' unless (args.is_a?(String) || args.nil?) | |
93 | 93 | @args = args |
94 | 94 | end |
95 | 95 | end |
96 | 96 | |
97 | 97 | |
98 | 98 | |
... | ... | @@ -97,20 +97,31 @@ |
97 | 97 | class IndriQueries |
98 | 98 | attr_accessor :params, :queries |
99 | 99 | |
100 | - def initialize(params,*queries) | |
101 | - @queries = queries | |
100 | + def initialize params | |
101 | +# @queries = queries | |
102 | 102 | |
103 | 103 | @params = params |
104 | + @queries = {} | |
104 | 105 | # Here we set the default retrieval model as Language Modeling |
105 | 106 | # with a Dirichlet smoothing at 2500. |
106 | 107 | # TODO: maybe a Rule class... |
107 | 108 | @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil? |
108 | 109 | end |
109 | 110 | |
111 | + def push id,query | |
112 | + @queries[id.to_i] = query | |
113 | + end | |
114 | + | |
110 | 115 | def to_s |
111 | 116 | h = "<parameters>\n" |
112 | 117 | h += @params.to_s |
113 | - h += @queries.collect { |q| q.to_s }.join "" | |
118 | + h += @queries.sort { |a,b| a[0] <=> b[0] }.collect do |q| | |
119 | + "<query>\n" + | |
120 | + "<number>#{q[0]}</number>\n" + | |
121 | + "<text>#{q[1]}</text>\n" + | |
122 | + "</query>\n" | |
123 | + end.join "" | |
124 | +# h += @queries.collect { |q| q.to_s }.join "" | |
114 | 125 | h += "</parameters>" |
115 | 126 | |
116 | 127 | h |
lib/mirimiri/string.rb
... | ... | @@ -161,11 +161,35 @@ |
161 | 161 | Transmap.inject(self.force_encoding("ASCII-8BIT")) { |str, (utf8, asc)| str.gsub(utf8, asc) } |
162 | 162 | end |
163 | 163 | |
164 | - # Returns +true+ if +self+ belongs to Rir::Stoplist, +false+ otherwise. | |
164 | + # Returns +true+ if +self+ belongs to Mirimiri::Stoplist, +false+ otherwise. | |
165 | 165 | def is_stopword? |
166 | 166 | Stoplist.include?(self.downcase) |
167 | 167 | end |
168 | 168 | |
169 | + def sequential_dependence_model t=0.85,o=0.10,u=0.05,field=nil | |
170 | + d = Mirimiri::Document.new self | |
171 | + | |
172 | + if field.nil? | |
173 | + ematch = d.ngrams(2).collect { |ng| "#1(#{ng})" } | |
174 | + pmatch = d.ngrams(2).collect { |ng| "#uw8(#{ng})" } | |
175 | + else | |
176 | + ematch = d.ngrams(2).collect { |ng| "#1(#{ng}).(#{field})" } | |
177 | + pmatch = d.ngrams(2).collect { |ng| "#uw8(#{ng}).(#{field})" } | |
178 | + end | |
179 | + | |
180 | + if ematch.empty? | |
181 | + if field.nil? | |
182 | + ematch = d.words.collect { |ng| "#1(#{ng})" } | |
183 | + pmatch = d.words.collect { |ng| "#uw8(#{ng})" } | |
184 | + else | |
185 | + ematch = d.words.collect { |ng| "#1(#{ng}).(#{field})" } | |
186 | + pmatch = d.words.collect { |ng| "#uw8(#{ng}).(#{field})" } | |
187 | + end | |
188 | + end | |
189 | + | |
190 | + "#weight ( #{t} #combine( #{self} ) #{o} #combine ( #{ematch.join(" ")} ) #{u} #combine ( #{pmatch.join(" ")} ) )" | |
191 | + end | |
192 | + | |
169 | 193 | # Do not use. |
170 | 194 | # TODO: revamp. find why this function is here. |
171 | 195 | def remove_special_characters |
... | ... | @@ -258,5 +282,14 @@ |
258 | 282 | end |
259 | 283 | |
260 | 284 | private :strip_with_pattern |
285 | +end | |
286 | + | |
287 | +module Indri | |
288 | + class IndriPrintedDocuments < String | |
289 | + | |
290 | + def extract_docs | |
291 | + self.split(/\d+ Q0 .+ \d+ -\d+.\d+ .+/).delete_if{ |x| x.empty? } | |
292 | + end | |
293 | + end | |
261 | 294 | end |
main.rb
... | ... | @@ -5,4 +5,8 @@ |
5 | 5 | w = Mirimiri::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan") |
6 | 6 | p w.entropy("dillinger escape plan") |
7 | 7 | p w.tf("guitar") |
8 | + | |
9 | +query = Indri::IndriQuery.new({:query => "dillinger escape plan".sequential_dependence_model, :count => 10}, "-trecFormat=true -printDocuments=true") | |
10 | +index = Indri::IndriIndex.new "/mnt/disk1/ClueWeb09_English_1noSpam" | |
11 | +s = Indri::IndriPrintedDocuments.new(index.runquery(query).force_encoding("ISO-8859-1").encode("UTF-8")) |