Commit 65040e3e69f49124aec05e30451769bb1c5f9d01
1 parent e55598814a
Exists in master
Changes made for the OAIR paper
Showing 7 changed files with 681 additions and 63 deletions
README.markdown
... | ... | @@ -5,6 +5,8 @@ |
5 | 5 | Stand-alone functions built on top of mirimiri and lda-ruby. |
6 | 6 | Aiming to extract contextual features from general corpora related to search scenario. |
7 | 7 | |
8 | +This code was used to produce the results reported in our OAIR'13 paper: "Unsupervised Latent Concept Modeling to Identify Query Facets". | 
9 | + | |
8 | 10 | License |
9 | 11 | ======= |
10 | 12 |
context.rb
1 | -#!/usr/bin/env ruby | |
2 | - | |
3 | -require 'mirimiri' | |
4 | -require 'sanitize' | |
5 | -require 'lda-ruby' | |
6 | - | |
7 | -module Context | |
8 | - IndexPaths = { | |
9 | - :web_en => '/mnt/disk2/ClueWeb09_English_1_sDocs', | |
10 | - :web_fr => '/mnt/disk2/ClueWeb09_French_1_sDocs', | |
11 | - :web_nospam => '/mnt/disk1/ClueWeb09_English_1noSpam', | |
12 | - :gigaword => '/local/data/GigaWord/index', | |
13 | - :nyt => '/local/data/NYT_index', | |
14 | - :wiki_en => '/local/data/WikiEn_index', | |
15 | - :wiki_fr => '/local/data/WikiFr_index' | |
16 | - } | |
17 | - | |
18 | - def Context.term_context index_path,query,size,num_page,args={} | |
19 | - args[:func] ||= :entropy | |
20 | - args[:window] ||= 1 | |
21 | - | |
22 | - docs = self.feedback_docs index_path,query,num_page | |
23 | - | |
24 | - resource = Mirimiri::Document.new docs.join(' ') | |
25 | - terms = self.extract_ngrams resource,args[:func].to_sym,args[:window] | |
26 | - | |
27 | - context = "#weight ( #{terms.compact.sort{ |a,b| b[0] <=> a[0]}[0,size].collect { |e| "#{e[0]} #1(#{e[1]})" }.join " "} ) " unless terms.empty? | |
28 | - | |
29 | - context | |
30 | - end | |
31 | - | |
32 | - def Context.topic_context index_path,query,size,num_page,args={} | |
33 | - corpus = Lda::Corpus.new | |
34 | - | |
35 | - docs = self.feedback_docs index_path,query,num_page | |
36 | - docs.each do |d| | |
37 | - doc = Lda::TextDocument.new corpus,d | |
38 | - corpus.add_document doc | |
39 | - end | |
40 | - | |
41 | - lda = Lda::Lda.new corpus | |
42 | - lda.num_topics = num_page/10 | |
43 | - lda.em 'random' | |
44 | - puts lda.top_words(size) | |
45 | - end | |
46 | - | |
47 | - private | |
48 | - def Context.feedback_docs index_path,query,num_page | |
49 | - query = Indri::IndriQuery.new({:query => query, :count => num_page},"-printDocuments=true -trecFormat=true") | |
50 | - index = Indri::IndriIndex.new index_path | |
51 | - idocs = Indri::IndriPrintedDocuments.new(index.runquery(query).force_encoding("ISO-8859-1").encode("UTF-8")) | |
52 | - | |
53 | - docs = idocs.extract_docs.collect { |idoc| Sanitize.clean idoc,:remove_contents => ['script'] } | |
54 | - docs | |
55 | - end | |
56 | - | |
57 | - def Context.extract_ngrams resource,func,n | |
58 | - raw_terms = 1.upto(n).collect { |i| resource.ngrams(i) }.flatten | |
59 | - terms = raw_terms.uniq.collect { |w| [resource.send(func.to_sym,w), w.unaccent] unless w.is_stopword? || w.split.all? { |e| e.length <= 1 } || w.split.all? { |e| e !~ /[a-zA-Z]/ } || w.include?(".") } | |
60 | - terms | |
61 | - end | |
62 | - | |
63 | -end |
lib/context.rb
1 | +#!/usr/bin/env ruby | |
2 | + | |
3 | +require 'mirimiri' | |
4 | +require 'sanitize' | |
5 | +require 'lda-ruby' | |
6 | +require 'context/conceptual_element' | |
7 | +require 'context/concept_model' | |
8 | +require 'context/concept' | |
9 | +require 'context/query_context' | |
10 | + | |
11 | + | |
12 | +module Context | |
13 | + @@count = Hash.new { |h,k| h[k] = {} } | |
14 | + @@df = Hash.new { |h,k| h[k] = {} } | |
15 | + @@semaphore = Mutex.new | |
16 | + | |
17 | + IndexPaths = { | |
18 | + :web_en => '/mnt/disk2/ClueWeb09_English_1_sDocs', | |
19 | + :web_fr => '/mnt/disk2/ClueWeb09_French_1_sDocs', | |
20 | + :web_nospam => '/mnt/disk1/ClueWeb09_English_1noSpam', | |
21 | + :robust => '/mnt/disk5/Robust04/', | |
22 | + :wt10g => '/mnt/disk3/WT10g_index', | |
23 | + :gov2 => '/mnt/disk3/GOV2_index', | |
24 | + :gigaword => '/local/data/GigaWord/index', | |
25 | + :nyt => '/local/data/NYT_index', | |
26 | + :wiki_en => '/local/data/WikiEn_index', | |
27 | + :wiki_en2012 => '/local/data/WikiEn2012_index', | |
28 | + :wiki_fr => '/local/data/WikiFr_index', | |
29 | + :wiki_tc2012 => '/local/data/INEXQA2012index', | |
30 | + :books => '/local/data/INEX/Books2011/indexedit', | |
31 | + :ent => '/home/sanjuan/works/nist_eval/csiro_indri.ind' | |
32 | + } | |
33 | + | |
34 | + IndexPathsCaracole = { | |
35 | + :web_en => '/distant/index_clueweb/disk2/ClueWeb09_English_1_sDocs', | |
36 | + :web_nospam => '/distant/index_clueweb/disk1/ClueWeb09_English_1noSpam', | |
37 | + :robust => '/distant/data/Robust04', | |
38 | + :wt10g => '/distant/index_clueweb/disk3/WT10g_index', | |
39 | + :gov2 => '/distant/index_clueweb/disk3/GOV2_index' | |
40 | + } | |
41 | + | |
42 | + # # | |
43 | + # From the SIGKDD 2007 paper : "Exploiting underrepresented query aspects for automatic query expansion" | |
44 | + def Context.query_aspects q | |
45 | + query = Mirimiri::Document.new q | |
46 | + | |
47 | + 2.upto(query.words.count) do |size| | |
48 | + query.ngrams(size).each do |s| | |
49 | + dp = Context.count_w Context::IndexPaths[:wiki_en2012],"#1(#{s})" | |
50 | + d = Context.count_w Context::IndexPaths[:wiki_en2012],"#{s}",100000000 | |
51 | + p s | |
52 | + | |
53 | + denom = s.split.permutation.inject(0.0) do |res,p| | 
54 | + tmp = (p == s.split) ? 0 : Context.count_w(Context::IndexPaths[:wiki_en2012],"#1(#{p.join(" ")})") | |
55 | + res + tmp | |
56 | + end | |
57 | + | |
58 | + existence = dp.to_f/d | |
59 | + support = dp.to_f/denom | 
60 | + puts "#{s} ===> #{existence*support}" | |
61 | + end | |
62 | + end | |
63 | + end | |
64 | + | |
65 | + # # | |
66 | + # From the CIKM 2007 paper : "Ranking Very Many Typed Entities on Wikipedia" | |
67 | + # | |
68 | + # The ``entities`` parameter is currently an array of strings. Could be moved | |
69 | + # to an array of Entity objects. | |
70 | + def Context.entity_web_ranking query,entities,nbdocs=100,index='web_nospam' | |
71 | + q = Indri::IndriQuery.new({:query => "#combine ( #{query} )", :count => nbdocs},"-trecFormat=true") | |
72 | + indri_index = Indri::IndriIndex.new IndexPaths[index.to_sym] | |
73 | + docs = indri_index.runquery(q).force_encoding("ISO-8859-1").encode("UTF-8") | |
74 | + query_list = docs.split("\n").collect { |p| p.scan(/\d+ Q0 (.+) \d+ -\d+.\d+ .+/).first }.flatten | |
75 | + | |
76 | + res = entities.pmap(15) do |e| | |
77 | + eq = Indri::IndriQuery.new({:query => "#combine ( #{e.gsub(/[^a-zA-Z0-9\s]/,'')} )", :count => nbdocs},"-trecFormat=true") | |
78 | + edocs = indri_index.runquery(eq).force_encoding("ISO-8859-1").encode("UTF-8") | |
79 | + e_list = edocs.split("\n").collect { |p| p.scan(/\d+ Q0 (.+) \d+ -\d+.\d+ .+/).first }.flatten | |
80 | + | |
81 | + rels = e_list&query_list | |
82 | + | |
83 | + ave_p = 1.upto(nbdocs).inject(0.0) do |sum,k| | |
84 | + p = (e_list.first(k)&rels).count.to_f/k | |
85 | + rel = rels.include?(e_list[k-1]) ? 1.0 : 0.0 | |
86 | + sum + p*rel | |
87 | + end | |
88 | + | |
89 | + {:name => e, :score => ave_p} | |
90 | + end | |
91 | + | |
92 | + res.sort { |a,b| b[:score] <=> a[:score] } | |
93 | + end | |
94 | + | |
95 | + def Context.query_entities query,nb_docs=10 | |
96 | + sources = ['wiki_en2012'] | |
97 | +# sources = ['wiki_en2012','web_nospam','nyt','gigaword'] | |
98 | +# sources = ['web_fr'] | |
99 | + sc = Hash.new { |h,k| h[k] = 0.0 } | |
100 | + | |
101 | + sources.each do |source| | |
102 | + puts " == source : #{source}" | |
103 | + c = ConceptModel.new query,source,nb_docs | |
104 | + p c.query | |
105 | + | |
106 | + c.concepts.each do |concept| | |
107 | + query_str = concept.words[0,4].join " " | 
108 | + | 
109 | + d1 = Context::label_candidate query_str.sequential_dependence_model,'wiki_en' | 
110 | + d2 = Context::label_candidate query_str.sequential_dependence_model,'wiki_en2012' | 
111 | + d3 = Mirimiri::WikipediaPage.search_wikipedia_titles query_str | 
112 | + | |
113 | + d1 = [] if d1.nil? | |
114 | + d2 = [] if d2.nil? | |
115 | + d3 = [] if d3.nil? | |
116 | + | |
117 | + d = d2 & d3 | |
118 | + labels = d.collect { |c| c.downcase.gsub(/[^\w\d]/,' ') } | |
119 | + p d | |
120 | + | |
121 | + | |
122 | + mins = -10000000 | |
123 | + lab = nil | |
124 | + scores = labels.collect do |l| | |
125 | + s = concept.score_label l | |
126 | + if s > mins | |
127 | + mins = s | |
128 | + lab = l | |
129 | + end | |
130 | + sc[l] += s*(concept.coherence/c.total_coherence) | |
131 | + { :label => l, :score => s } | |
132 | + end | |
133 | + | |
134 | + print (concept.coherence/c.total_coherence).to_s+" <= " | |
135 | + p concept.elements.collect { |c| c.word } | |
136 | + end | |
137 | + end | |
138 | + | |
139 | + sc.sort { |a,b| b[1] <=> a[1] } | |
140 | + end | |
141 | + | |
142 | + def Context.label_candidate query,index,nb_candidates=10,rm3=false # nb_candidates default assumed: Context.query_entities calls this with two arguments | 
143 | +# Mirimiri::WikipediaPage.search_wikipedia_titles query | |
144 | + args = rm3 ? "-fbDocs=20 -fbTerms=30 -fbMu=2000 -fbOrigWeight=0.7" : "" | |
145 | + q = Indri::IndriQuery.new({:query => query, :count => nb_candidates},"-printDocuments=true -trecFormat=true #{args}") | |
146 | + indri_index = Indri::IndriIndex.new IndexPaths[index.to_sym] | 
147 | + docs = indri_index.runquery q | 
148 | + docs = docs.force_encoding("ISO-8859-1").encode("UTF-8") if ['web_fr','web_en','web_nospam'].include? index # compare against the source key, not the IndriIndex object | 
149 | + idocs = Indri::IndriPrintedDocuments.new(docs) | |
150 | + | |
151 | + wiki_titles = idocs.extract_docs.collect do |d| | |
152 | + t = Nokogiri::HTML d | |
153 | + t.xpath('//title').text | |
154 | + end | |
155 | + | |
156 | + wiki_titles | |
157 | + end | |
158 | + | |
159 | + def Context.lcm query | |
160 | + source = 'nyt' | |
161 | + | |
162 | + a = Time.now | |
163 | + qc = QueryContext.new(1.upto(20).collect do |nb_docs| | |
164 | + beg = Time.now | |
165 | + c = ConceptModel.new query,source,nb_docs | |
166 | + puts "#{nb_docs} ==> Time elapsed: #{Time.now-beg} seconds" | |
167 | + c | |
168 | + end) | |
169 | + puts "All concepts : #{Time.now-a} seconds" | |
170 | + | |
171 | + model = qc.best_concept_model | |
172 | + puts "Total : #{Time.now-a} seconds" | |
173 | + model | |
174 | + end | |
175 | + | |
176 | + def Context.term_context index_path,query,size,num_page,args={} | |
177 | + terms = self.term_concepts index_path,query,size,num_page,args | |
178 | + args[:window] ||= 1 | |
179 | + | |
180 | +# context = "#weight ( #{terms.compact.sort{ |a,b| b[0] <=> a[0]}[0,size].collect { |e| "#{e[0]} #1(#{e[1]})" }.join " "} ) " unless terms.empty? | |
181 | + context = "#weight ( #{terms.collect { |c| "#{"%.10f" % c[:score]} #uw#{args[:window]}(#{c[:concept]})" }.join " "} ) " unless terms.empty? | |
182 | + | |
183 | + context | |
184 | + end | |
185 | + | |
186 | +# From the SIGIR'06 paper: `Improving the estimation of relevance models using large external corpora` (see the note after this diff) | 
187 | +# | |
188 | + def Context.morm index_path,query,size,num_page | |
189 | + docs,scores,names = self.feedback_docs index_path,query,num_page | |
190 | + | |
191 | + terms = [] | |
192 | + | |
193 | + docs.each do |d| | |
194 | + r = Mirimiri::Document.new d | |
195 | + tmp = self.extract_ngrams r,:tf,1 | |
196 | + terms += tmp.compact.collect { |t| [t[0]*Math.exp(scores[docs.index(d)].to_f),t[1]] } | |
197 | + end | |
198 | + | |
199 | + final = terms.compact.sort{ |a,b| b[0] <=> a[0]}[0,size].collect { |e| { :score => e[0], :concept => e[1] } } | |
200 | + context = "#weight ( #{final.collect { |c| "#{"%.10f" % c[:score]} #1(#{c[:concept]})" }.join " "} ) " unless terms.empty? | |
201 | + | |
202 | + context | |
203 | + end | |
204 | + | |
205 | + def Context.term_concepts index_path,query,size,num_page,args={} | |
206 | + args[:func] ||= :entropy | |
207 | + args[:window] ||= 1 | |
208 | + | |
209 | + docs = self.feedback_docs index_path,query,num_page | |
210 | + | |
211 | + resource = Mirimiri::Document.new docs.join(' ') | |
212 | + terms = self.extract_ngrams resource,args[:func].to_sym,args[:window] | |
213 | + | |
214 | + terms.compact.sort{ |a,b| b[0] <=> a[0]}[0,size].collect { |e| { :score => e[0], :concept => e[1] } } | |
215 | + end | |
216 | + | |
217 | + | |
218 | + def Context.sentence_similarity s1,s2,index_path | |
219 | + q = s1.is_a?(String) ? s1.split : s1 | |
220 | + r = s2.is_a?(String) ? s2.split : s2 | |
221 | + | |
222 | + inter = q & r | |
223 | + | |
224 | + s = (inter.count/q.count.to_f) * inter.inject(0.0) { |sum,w| sum + Math.log(Context.df index_path,w) } | |
225 | + s | |
226 | + end | |
227 | + | |
228 | + | |
229 | + private | |
230 | + | |
231 | + def Context.df index_path,w,window=1 | |
232 | + if @@count[index_path]["total#{window}"].nil? | |
233 | + total = `dumpindex #{index_path} s`.match(/total terms:\t(.*)/)[1].to_f-(window-1).to_f | |
234 | + @@semaphore.synchronize { | |
235 | + @@count[index_path]["total#{window}"] = total | |
236 | + } | |
237 | + end | |
238 | + | |
239 | + if @@df[index_path]["#uw#{window}(#{w})"].nil? | |
240 | + nb = `dumpindex #{index_path} e "#uw#{window}(#{w})" | awk ' { arr[$1]=$0 } END { for ( key in arr ) { print arr[key] } } ' | wc -l`.chomp.split(':').last.to_f - 1 | |
241 | + @@semaphore.synchronize { | |
242 | + @@df[index_path]["#uw#{window}(#{w})"] = nb+1.0 | |
243 | + } | |
244 | + end | |
245 | + begin | |
246 | + d = @@count[index_path]["total#{window}"]/@@df[index_path]["#uw#{window}(#{w})"] | |
247 | + rescue | |
248 | + puts w | |
249 | + exit | |
250 | + end | |
251 | + d | |
252 | + end | |
253 | + | |
254 | + def Context.prob_w index_path,w,window=1 | |
255 | + if @@count[index_path]["total#{window}"].nil? | |
256 | + total = `dumpindex #{index_path} s`.match(/total terms:\t(.*)/)[1].to_f-(window-1).to_f | |
257 | + @@semaphore.synchronize { | |
258 | + @@count[index_path]["total#{window}"] = total+1.0 | |
259 | + } | |
260 | + end | |
261 | + | |
262 | + nb = self.count_w index_path,w,window | |
263 | + nb/@@count[index_path]["total#{window}"] | |
264 | + end | |
265 | + | |
266 | + def Context.count_w index_path,w,window=1 | |
267 | + if @@count[index_path]["##{window}(#{w})"].nil? | |
268 | + nb = `dumpindex #{index_path} x "##{window}(#{w})"`.chomp.split(':').last.to_f | |
269 | + @@semaphore.synchronize { | |
270 | + @@count[index_path]["##{window}(#{w})"] = ("%.15f" % nb).to_f+1.0 | |
271 | + } | |
272 | + end | |
273 | + @@count[index_path]["##{window}(#{w})"] | |
274 | + end | |
275 | + | |
276 | + | |
277 | + public | |
278 | + def Context.extract_ngrams resource,func,n | |
279 | + raw_terms = 1.upto(n).collect { |i| resource.ngrams(i) }.flatten | |
280 | +# raw_terms = resource.ngrams(n).flatten | |
281 | + terms = raw_terms.uniq.collect { |w| [resource.send(func.to_sym,w), w.unaccent] unless w.is_stopword? || w.split.any? { |e| e.length <= 2 } || w.split.any? { |e| e !~ /[a-zA-Z]/ } || w.include?(".") || (Mirimiri::Stoplist&w.unaccent.split).count >= 2 } | |
282 | +# terms = raw_terms.uniq.collect { |w| w=w.gsub(/\W/,' ').strip; [resource.send(func.to_sym,w), w.unaccent] unless w.is_stopword? || w.split.any? { |e| e.length <= 2 } || w.split.any? { |e| e !~ /[a-zA-Z]/ } || w.include?(".") || (Mirimiri::Stoplist&w.unaccent.split).count >= 1 } | |
283 | +# terms = raw_terms.uniq.collect { |w| [resource.send(func.to_sym,w), w.unaccent] unless w.is_stopword? || w.split.all? { |e| e.length <= 1 } || w.split.all? { |e| e !~ /[a-zA-Z]/ } || w.include?(".") } | |
284 | + terms | |
285 | + end | |
286 | + | |
287 | + def Context.feedback_docs index_path,query,num_page | |
288 | + query = Indri::IndriQuery.new({:query => query, :count => num_page},"-printDocuments=true -trecFormat=true") | |
289 | + index = Indri::IndriIndex.new index_path | |
290 | + idocs = Indri::IndriPrintedDocuments.new(index.runquery(query).force_encoding("ISO-8859-1").encode("UTF-8")) | |
291 | + | |
292 | + texts,scores,names = idocs.extract_docs_score | |
293 | + | |
294 | + docs = texts.collect do |idoc| | |
295 | + begin | |
296 | + Sanitize.clean idoc,:remove_contents => ['script','style'] | |
297 | + rescue | |
298 | + d = Nokogiri::HTML(idoc) | |
299 | + d.xpath('//text()').text | |
300 | + end | |
301 | + end | |
302 | + | |
303 | + return docs,scores,names | |
304 | + end | |
305 | + | |
306 | +end |
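
Note on Context.query_aspects (the SIGKDD 2007 method above): the printed score is existence * support, where existence = dp/d compares the exact ordered-phrase count against loose co-occurrence, and support = dp/denom compares it against the counts of all other orderings of the same words. A minimal runnable sketch; the count stub and its numbers are hypothetical stand-ins for the dumpindex-backed Context.count_w:

    #!/usr/bin/env ruby
    # Hypothetical stand-in for Context.count_w, which queries an Indri index
    # using operators such as #1(...) for exact ordered phrases.
    def count phrase_query
      { '#1(new york)' => 900.0,   # exact ordered phrase
        'new york'     => 1000.0,  # loose co-occurrence
        '#1(york new)' => 10.0     # reversed ordering
      }.fetch(phrase_query, 1.0)
    end

    def aspect_score ngram
      dp = count "#1(#{ngram})"
      d  = count ngram
      denom = ngram.split.permutation.inject(0.0) do |res,perm|
        res + (perm == ngram.split ? 0.0 : count("#1(#{perm.join(' ')})"))
      end
      existence = dp / d
      support   = denom.zero? ? 0.0 : dp / denom
      existence * support
    end

    puts aspect_score('new york')   # 0.9 * 90.0 => 81.0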
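
Note on Context.entity_web_ranking (the CIKM 2007 method above): each entity e is ranked by an average-precision-style score over its own document list e_list, where the pseudo-relevant set is rels = e_list ∩ query_list (documents retrieved both for the entity and for the query). What the code computes is the un-normalized sum

    score(e) = \sum_{k=1}^{nbdocs} P@k \cdot rel(k),  with  P@k = |e_list[1..k] \cap rels| / k  and  rel(k) = 1 iff e_list[k] \in rels;

standard average precision would additionally divide by |rels|.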
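
Note on Context.term_context and Context.morm: both end by serializing their scored concepts into an Indri #weight query. An illustrative term_context output with args[:window] = 1 (weights made up):

    #weight ( 0.0421170312 #uw1(solar energy) 0.0377810945 #uw1(photovoltaic) 0.0311200018 #uw1(panels) )

In morm, following the SIGIR'06 relevance-model estimation cited above it, each unigram t from a feedback document D is scored tf(t,D) * e^{score(D)}, where score(D) is D's Indri query-likelihood log-score, and the highest-scoring entries form the expansion query.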
lib/context/concept.rb
1 | +#!/usr/bin/env ruby | |
2 | + | |
3 | +class Concept | |
4 | + attr_reader :elements, :coherence | |
5 | + | |
6 | + def initialize | |
7 | + @elements = [] | |
8 | + @coherence = 0 | |
9 | + end | |
10 | + | |
11 | + def <<(elem) | |
12 | + raise ArgumentError, 'Argument must be a ConceptualElement.' unless elem.is_a? ConceptualElement | |
13 | + @elements << elem | |
14 | + end | |
15 | + | |
16 | + def compute_coherence scores,theta,k | 
17 | +# update_feedback_coherence arg | |
18 | + @coherence = 0.upto(theta.count-1).inject(0.0) do |sum,i| | |
19 | + sum + Math.exp(theta[i][k])*Math.exp(scores[i].to_f) | |
20 | + end | |
21 | + end | |
22 | + | |
23 | + def score_label label,index_path=Context::IndexPaths[:wiki_en2012] | |
24 | + s = @elements.inject(0.0) do |res,e| | |
25 | +# *self.prob_w(index_path,"#{w[:word]} #uw10(#{label})") | |
26 | + res + e.prob*Math.log(Context.prob_w(index_path,"#{e.word} #uw10(#{label})")/(e.p_in_coll*Context.prob_w(index_path,label))) | |
27 | + end | |
28 | + | |
29 | + s | |
30 | + end | |
31 | + | |
32 | + def concept_words_similarity s,index_path=Context::IndexPaths[:wiki_en] | |
33 | +# inter = @elements.collect { |w| w unless (w.word & s).empty? } | |
34 | + inter = self.words & s.words | |
35 | + | |
36 | + sim = (inter.count/self.words.count.to_f) * inter.inject(0.0) { |sum,w| sum + Math.log(Context.df index_path,w) } | |
37 | + sim | |
38 | + end | |
39 | + | |
40 | + def weighted_concept_similarity s,index_path=Context::IndexPaths[:wiki_en] | |
41 | + inter = self.words & s.words | |
42 | + sim = (inter.count/self.words.count.to_f) | |
43 | + sim *= @elements.inject(0.0) do |sum,w| | |
44 | + wp = s.get_element_from_word w.word | |
45 | + | |
46 | + s.words.include?(w.word) ? sum + wp.prob*w.prob*Math.log(Context.df index_path,w.word) : sum + 0.0 | |
47 | + end | |
48 | + sim | |
49 | + end | |
50 | + | |
51 | + def get_element_from_word w | |
52 | + @elements.select { |e| e.word == w }.first | |
53 | + end | |
54 | + | |
55 | + def words | |
56 | + @elements.collect { |w| w.word } | |
57 | + end | |
58 | + | |
59 | + def word_probs | |
60 | + res = {} | |
61 | + @elements.each { |w| res[w.word] = w.prob } | |
62 | + res | |
63 | + end | |
64 | + | |
65 | +# From the papers: | 
66 | +# NAACL'10: `Automatic Evaluation of Topic Coherence` | |
67 | +# EMNLP'12: `Exploring Topic Coherence over many models and many topics` | |
68 | +# | |
69 | + def uci_coherence epsilon=1,index_path=Context::IndexPaths[:wiki_en] | |
70 | + coherence = @elements.combination(2).inject(0.0) do |res,bigram| | |
71 | +#Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}",20)* | |
72 | + t = (Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}",20)+epsilon)/((bigram.first.p_in_coll)*(bigram.last.p_in_coll)) | |
73 | + res + Math.log(t) | |
74 | + end | |
75 | + | |
76 | + coherence /= @elements.count*(@elements.count-1) | |
77 | + coherence | |
78 | + end | |
79 | + | |
80 | + def uci_coherence_entropy index_path=Context::IndexPaths[:wiki_en] | |
81 | + coherence = @elements.combination(2).inject(0.0) do |res,bigram| | |
82 | +#Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}",20)* | |
83 | + t = (Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}",20))/((bigram.first.p_in_coll)*(bigram.last.p_in_coll)) | |
84 | + res + t | |
85 | + end | |
86 | + | |
87 | + coherence /= @elements.count*(@elements.count-1) | |
88 | + coherence | |
89 | + end | |
90 | + | |
91 | + protected | |
92 | + def update_coherence index_path=Context::IndexPaths[:wiki_en] | |
93 | + coherence = @elements.combination(2).inject(0.0) do |res,bigram| | |
94 | + res + Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}")*Math.log(Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}")/((bigram.first.p_in_coll index_path)*(bigram.last.p_in_coll index_path))) | |
95 | + end | |
96 | + | |
97 | + coherence /= @elements.count*(@elements.count-1) | |
98 | + @coherence = coherence | |
99 | + end | |
100 | + | |
101 | + def update_feedback_coherence documents | |
102 | + corpus = Mirimiri::Document.new documents.join " " | |
103 | + | |
104 | + windows = corpus.ngrams(10).collect { |w| w.split } | |
105 | + | |
106 | + coherence = @elements.combination(2).inject(0.0) do |res,bigram| | |
107 | + big_prob = windows.count{ |c| c.include?(bigram.first.word) && c.include?(bigram.last.word) }.to_f/windows.count | |
108 | + mi = big_prob.zero? ? 0.0 : big_prob*bigram.first.prob*bigram.last.prob*Math.log(big_prob/(corpus.tf(bigram.first.word)*corpus.tf(bigram.last.word))) | |
109 | + res + mi | |
110 | + end | |
111 | + | |
112 | + coherence /= @elements.count*(@elements.count-1) | |
113 | + @coherence = coherence | |
114 | + end | |
115 | + | |
116 | +end |
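
Note on Concept#score_label: a candidate label l is scored by the concept's expected pointwise mutual information between the label and each concept word, with joint probabilities estimated from #uw10 windows on the wiki_en2012 index:

    score(l) = \sum_{e} p(e) \log \frac{p(e, l)}{p_{coll}(e)\, p(l)}

where p(e) is the element's in-topic probability and p_coll(e) its collection probability.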
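
Note on Concept#uci_coherence: this implements the UCI coherence measure from the NAACL'10 and EMNLP'12 papers cited above, estimating joint probabilities from 20-term unordered windows with epsilon smoothing:

    C_{UCI} = \frac{1}{N(N-1)} \sum_{i<j} \log \frac{p(w_i, w_j) + \epsilon}{p(w_i)\, p(w_j)}

The code sums over the N(N-1)/2 unordered pairs but divides by N(N-1), i.e. half the usual 2/(N(N-1)) normalization; since that factor is constant for a fixed number of elements, relative comparisons are unchanged.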
lib/context/concept_model.rb
1 | +#!/usr/bin/env ruby | |
2 | + | |
3 | +require 'lda-ruby' | |
4 | +require 'peach' | |
5 | + | |
6 | +class ConceptModel | |
7 | + attr_reader :concepts,:documents,:source,:nbdocs,:nbterms,:query,:total_coherence,:doc_scores,:doc_names,:theta,:entropy_coherence,:avg_coherence | |
8 | + | |
9 | + def ConceptModel.parse_hdp str | |
10 | + concepts = [] | |
11 | + eval(str).each do |hdp_top| | |
12 | + c = Concept.new | |
13 | + hdp_top.gsub(/topic \d: /,'').split(" + ").each do |words| | |
14 | + ee = words.split('*') | |
15 | + begin | |
16 | + e = ConceptualElement.new ee[1],ee[0].to_f | |
17 | + c << e | |
18 | + rescue ArgumentError | |
19 | + next | |
20 | + end | |
21 | + end | |
22 | + | |
23 | + concepts << c | |
24 | + end | |
25 | + concepts | |
26 | + end | |
27 | + | |
28 | + def initialize query,source,nb_docs,nb_terms=10,k=false | |
29 | + raise ArgumentError, 'Argument 1 must be a String.' unless query.is_a? String | |
30 | + raise ArgumentError, 'Argument 2 must be a valid Index key.' unless Context::IndexPaths.has_key?(source.to_sym) | |
31 | + | |
32 | + @source = source.to_sym | |
33 | + @nbdocs = nb_docs | |
34 | + @nbterms = nb_terms | |
35 | + @query = query | |
36 | + @concepts = [] | |
37 | + @total_coherence = 0.0 | |
38 | + | |
39 | + corpus = Lda::Corpus.new | |
40 | + | |
41 | + @documents,@doc_scores,@doc_names = Context.feedback_docs Context::IndexPaths[@source],@query,@nbdocs | |
42 | + @documents.each do |d| | |
43 | + doc = Lda::TextDocument.new corpus,d | |
44 | + corpus.add_document doc | |
45 | + end | |
46 | + | |
47 | + if k == false | |
48 | + num_topics = topic_divergence corpus | |
49 | + else | |
50 | + num_topics = k | |
51 | + end | |
52 | + | |
53 | + lda = Lda::Lda.new corpus | |
54 | + lda.verbose=false | |
55 | + lda.num_topics = num_topics | |
56 | + | |
57 | + lda.em('random') | |
58 | + | |
59 | + @beta = lda.beta # to avoid repeated expensive computation | |
60 | + @vocab = lda.vocab # | |
61 | + | |
62 | + @theta = lda.compute_topic_document_probability | |
63 | + | |
64 | +# Normalizing the phi_t(w) weights for each topic | |
65 | +# | |
66 | + total_prob = {} | |
67 | + tmp_top_word_indices(@nbterms,@vocab,@beta).each_pair do |topic,indices| | |
68 | + total_prob[topic] = indices.inject(0.0) { |res,i| res + Math.exp(@beta[topic][i].to_f) } | |
69 | + end | |
70 | + | |
71 | + tmp_top_word_indices(@nbterms,@vocab,@beta).each_pair do |topic,indices| | |
72 | + c = Concept.new | |
73 | + indices.each do |i| | |
74 | + begin | |
75 | + e = ConceptualElement.new @vocab[i],(Math.exp(@beta[topic][i].to_f)/total_prob[topic]) | |
76 | + c << e | |
77 | + rescue ArgumentError | |
78 | + next | |
79 | + end | |
80 | + end | |
81 | + | |
82 | + c.compute_coherence @doc_scores,@theta,topic | |
83 | + | |
84 | +# c.compute_coherence @doc_scores,gamma_m,topic # takes time since it has to compute several probabilities | |
85 | + @concepts << c | |
86 | + @total_coherence += c.coherence | |
87 | + end | |
88 | + end | |
89 | + | |
90 | + def to_s | |
91 | + @concepts.collect do |c| | |
92 | + "#{c.coherence/@total_coherence} => [#{c.elements.collect do |e| | |
93 | + "#{e.prob} #{e.word}" | |
94 | + end.join(', ') | |
95 | + }]" | |
96 | + end.join "\n" | |
97 | + end | |
98 | + | |
99 | + def to_indriq | |
100 | + "#weight( #{@concepts.collect do |c| | |
101 | + "#{c.coherence/@total_coherence} #weight ( #{c.elements.collect do |e| | |
102 | + "#{e.prob} #{e.word}" | |
103 | + end.join(' ') | |
104 | + } ) " | |
105 | + end.join " "} )" | |
106 | + end | |
107 | + | |
108 | + def <<(concept) | |
109 | + raise ArgumentError, 'Argument must be a Concept.' unless concept.is_a? Concept | 
110 | + @concepts << concept | |
111 | + end | |
112 | + | |
113 | + def avg_model_coherence | |
114 | + if @documents.empty? | |
115 | + @avg_coherence = 0.0 | |
116 | + else | |
117 | + @avg_coherence = @concepts.inject(0.0) { |res,c| res + c.uci_coherence }/@concepts.count #if @avg_coherence.nil? | |
118 | + end | |
119 | + @avg_coherence | |
120 | + end | |
121 | + | |
122 | + def entropy_model_coherence | |
123 | + if @documents.empty? | |
124 | + @entropy_coherence = 0.0 | |
125 | + else | |
126 | + @entropy_coherence = @concepts.inject(0.0) do |res,c| | |
127 | + ent = c.uci_coherence_entropy | |
128 | + ent += 1e-25 if ent.zero? | 
129 | + res + ent*Math.log(ent) | |
130 | + end #if @entropy_coherence.nil? | |
131 | + end | |
132 | + @entropy_coherence | |
133 | + end | |
134 | + | |
135 | + private | |
136 | + def topic_divergence corpus | |
137 | + max_kl = 0.0 | |
138 | +# Old trick to limit number of iterations | |
139 | +# num_p = @nbdocs < 6 ? @nbdocs + 5 : @nbdocs | |
140 | + | |
141 | + semaphore = Mutex.new | |
142 | + | |
143 | + 1.upto(20).inject do |k,ntop| | |
144 | +# 1.upto(num_p).inject do |k,ntop| | |
145 | + lda = Lda::Lda.new corpus | |
146 | + lda.verbose=false | |
147 | + lda.num_topics = ntop | |
148 | + lda.em('random') | |
149 | + beta_m = lda.beta # to avoid repeated expensive computation | |
150 | + vocab = lda.vocab | |
151 | + | |
152 | + topics_i = Array.new(ntop) { |i| i } | |
153 | + | |
154 | + sum_kl = topics_i.combination(2).inject(0.0) do |kl,topics| | |
155 | + ti = topics.first | |
156 | + tj = topics.last | |
157 | + begin | |
158 | + kl + 0.upto(vocab.count-1).inject(0.0) do |res,w_i| | |
159 | + res + ( Math.exp(beta_m[ti][w_i])*Math.log(Math.exp(beta_m[ti][w_i])/Math.exp(beta_m[tj][w_i])) + Math.exp(beta_m[tj][w_i])*Math.log(Math.exp(beta_m[tj][w_i])/Math.exp(beta_m[ti][w_i])) ) | |
160 | + end | |
161 | + rescue | |
162 | + kl + 0.0 | |
163 | + end | |
164 | + end | |
165 | + | |
166 | + sum_kl /= ntop*(ntop-1) | |
167 | + sum_kl = max_kl if sum_kl.nan? || sum_kl.infinite? | |
168 | + | |
169 | + sum_kl <= max_kl ? k : (max_kl = sum_kl; ntop) | 
170 | + end | |
171 | + end | |
172 | + | |
173 | + def tmp_top_word_indices(words_per_topic = 10,vocab,beta) | |
174 | + raise 'No vocabulary loaded.' unless vocab | |
175 | + | |
176 | + # find the highest scoring words per topic | |
177 | + topics = Hash.new | |
178 | + indices = (0...vocab.size).to_a | |
179 | + | |
180 | + beta.each_with_index do |topic, topic_num| | |
181 | + topics[topic_num] = (topic.zip((0...vocab.size).to_a).sort { |i, j| i[0] <=> j[0] }.map { |i, j| j }.reverse)[0...words_per_topic] | |
182 | + end | |
183 | + | |
184 | + topics | |
185 | + end | |
186 | + | |
187 | +end |
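
Note on the private ConceptModel#topic_divergence: the number of LDA topics is selected by sweeping k from 1 to 20 and keeping the k whose model maximizes the average symmetric Kullback-Leibler divergence between topic-word distributions (lda-ruby's beta matrix stores log probabilities, hence the Math.exp calls):

    D_{sym}(\varphi_i, \varphi_j) = \sum_{w} \varphi_i(w) \log\frac{\varphi_i(w)}{\varphi_j(w)} + \varphi_j(w) \log\frac{\varphi_j(w)}{\varphi_i(w)}

summed over all unordered topic pairs and divided by k(k-1); a NaN or infinite average falls back to the best value seen so far.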
lib/context/conceptual_element.rb
1 | +#!/usr/bin/env ruby | |
2 | + | |
3 | +class ConceptualElement | |
4 | + attr_reader :word, :prob | |
5 | + | |
6 | + def initialize w,s | |
7 | + raise ArgumentError, 'Argument 1 must be a String.' unless w.is_a? String | |
8 | + raise ArgumentError, 'Argument 2 must be a Float.' unless s.is_a? Float | |
9 | + | |
10 | + tmp = w.gsub(/(-)\1+/,'-').gsub(/([^\w-].*|^-|-$)/,'') | |
11 | + raise ArgumentError, 'Argument 1 is not a useful word.' if tmp.is_stopword? || tmp.size < 2 | 
12 | + | |
13 | + @word = tmp | |
14 | + @prob = s | |
15 | + end | |
16 | + | |
17 | + def p_in_coll index_path=Context::IndexPaths[:wiki_en],size=20 | |
18 | + Context.prob_w index_path,@word,size | |
19 | + end | |
20 | +end |
lib/context/query_context.rb
1 | +#!/usr/bin/env ruby | |
2 | + | |
3 | +class QueryContext < Array | |
4 | + | |
5 | + def best_concept_model | |
6 | + max_sim = 0.0 | |
7 | + best = nil | |
8 | + | |
9 | + for p in 0...self.count | |
10 | + sim = 0.0 | |
11 | + for pp in 0...self.count | |
12 | + next if pp == p | |
13 | + combs = self.at(p).concepts.product self.at(pp).concepts | |
14 | + sum_sim = combs.inject(0.0) { |sum,k| sum + k.first.weighted_concept_similarity(k.last) } | |
15 | + sim += sum_sim/combs.count | |
16 | + end | |
17 | + | |
18 | + | |
19 | + if sim > max_sim | |
20 | + max_sim = sim | |
21 | + best = p | |
22 | + end | |
23 | + end | |
24 | + | |
25 | + best.nil? ? nil : self.at(best) | |
26 | + end | |
27 | + | |
28 | + def best_concept_model_word | |
29 | + max_sim = 0.0 | |
30 | + best = nil | |
31 | + | |
32 | + for p in 0...self.count | |
33 | + sim = 0.0 | |
34 | + for pp in 0...self.count | |
35 | + next if pp == p | |
36 | + combs = self.at(p).concepts.product self.at(pp).concepts | |
37 | + sum_sim = combs.inject(0.0) { |sum,k| sum + k.first.concept_words_similarity(k.last) } | |
38 | + sim += sum_sim/combs.count | |
39 | + end | |
40 | + | |
41 | + | |
42 | + if sim > max_sim | |
43 | + max_sim = sim | |
44 | + best = p | |
45 | + end | |
46 | + end | |
47 | + | |
48 | + best.nil? ? nil : self.at(best) | |
49 | + end | |
50 | +end |
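
End-to-end usage sketch: the entry point added by this commit appears to be Context.lcm, which builds one ConceptModel per feedback-set size (1 to 20 documents, reading from the :nyt index) and lets QueryContext#best_concept_model keep the model most similar to all the others. This assumes the gems (mirimiri, lda-ruby, sanitize, peach) and the hard-coded index paths above are available on the machine; the query string is made up:

    #!/usr/bin/env ruby
    $LOAD_PATH.unshift 'lib'
    require 'context'

    model = Context.lcm 'dinosaur extinction theories'   # hypothetical query
    puts model            # one "coherence => [prob word, ...]" line per concept
    puts model.to_indriq  # nested Indri #weight(...) query built from the concepts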