Commit 65040e3e69f49124aec05e30451769bb1c5f9d01
1 parent e55598814a
Exists in master
changes made for the OAIR paper
Showing 7 changed files with 681 additions and 63 deletions
README.markdown
 # context
 
 Copyright (C) 2012 Romain Deveaud <romain.deveaud@gmail.com>
 
 Stand-alone functions built on top of mirimiri and lda-ruby.
 Aiming to extract contextual features from general corpora related to search scenarios.
 
+This code was used to produce the results reported in our OAIR'13 paper: "Unsupervised Latent Concept Modeling to Identify Query Facets".
+
 License
 =======
 
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
 the Free Software Foundation, either version 3 of the License, or
 (at your option) any later version.
 
 This program is distributed in the hope that it will be useful,
 but WITHOUT ANY WARRANTY; without even the implied warranty of
 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 GNU General Public License for more details.
 
 You should have received a copy of the GNU General Public License
 along with this program. If not, see <http://www.gnu.org/licenses/>.
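The commit adds no usage documentation, so here is a minimal, hypothetical sketch of how the API introduced below might be driven. The require name 'context', the index key 'wiki_en' and the query string are assumptions inferred from lib/context.rb in this diff, and the hard-coded Indri index paths have to exist for it to run.

#!/usr/bin/env ruby
# Hypothetical usage sketch, not part of the commit.
require 'context'

# Build a latent concept model from pseudo-relevance feedback documents
# retrieved for the query, then serialize it as an Indri #weight query.
model = ConceptModel.new 'hubble telescope achievements', 'wiki_en', 10
puts model            # to_s: one weighted concept (topic) per line
puts model.to_indriq  # Indri query string usable for retrieval

ConceptModel.new raises ArgumentError if the source key is not one of the IndexPaths entries defined in lib/context.rb.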
context.rb
File was deleted

#!/usr/bin/env ruby

require 'mirimiri'
require 'sanitize'
require 'lda-ruby'

module Context
  IndexPaths = {
    :web_en => '/mnt/disk2/ClueWeb09_English_1_sDocs',
    :web_fr => '/mnt/disk2/ClueWeb09_French_1_sDocs',
    :web_nospam => '/mnt/disk1/ClueWeb09_English_1noSpam',
    :gigaword => '/local/data/GigaWord/index',
    :nyt => '/local/data/NYT_index',
    :wiki_en => '/local/data/WikiEn_index',
    :wiki_fr => '/local/data/WikiFr_index'
  }

  def Context.term_context index_path,query,size,num_page,args={}
    args[:func] ||= :entropy
    args[:window] ||= 1

    docs = self.feedback_docs index_path,query,num_page

    resource = Mirimiri::Document.new docs.join(' ')
    terms = self.extract_ngrams resource,args[:func].to_sym,args[:window]

    context = "#weight ( #{terms.compact.sort{ |a,b| b[0] <=> a[0]}[0,size].collect { |e| "#{e[0]} #1(#{e[1]})" }.join " "} ) " unless terms.empty?

    context
  end

  def Context.topic_context index_path,query,size,num_page,args={}
    corpus = Lda::Corpus.new

    docs = self.feedback_docs index_path,query,num_page
    docs.each do |d|
      doc = Lda::TextDocument.new corpus,d
      corpus.add_document doc
    end

    lda = Lda::Lda.new corpus
    lda.num_topics = num_page/10
    lda.em 'random'
    puts lda.top_words(size)
  end

  private
  def Context.feedback_docs index_path,query,num_page
    query = Indri::IndriQuery.new({:query => query, :count => num_page},"-printDocuments=true -trecFormat=true")
    index = Indri::IndriIndex.new index_path
    idocs = Indri::IndriPrintedDocuments.new(index.runquery(query).force_encoding("ISO-8859-1").encode("UTF-8"))

    docs = idocs.extract_docs.collect { |idoc| Sanitize.clean idoc,:remove_contents => ['script'] }
    docs
  end

  def Context.extract_ngrams resource,func,n
    raw_terms = 1.upto(n).collect { |i| resource.ngrams(i) }.flatten
    terms = raw_terms.uniq.collect { |w| [resource.send(func.to_sym,w), w.unaccent] unless w.is_stopword? || w.split.all? { |e| e.length <= 1 } || w.split.all? { |e| e !~ /[a-zA-Z]/ } || w.include?(".") }
    terms
  end

end
lib/context.rb
File was created

#!/usr/bin/env ruby

require 'mirimiri'
require 'sanitize'
require 'lda-ruby'
require 'context/conceptual_element'
require 'context/concept_model'
require 'context/concept'
require 'context/query_context'


module Context
  @@count = Hash.new { |h,k| h[k] = {} }
  @@df = Hash.new { |h,k| h[k] = {} }
  @@semaphore = Mutex.new

  IndexPaths = {
    :web_en => '/mnt/disk2/ClueWeb09_English_1_sDocs',
    :web_fr => '/mnt/disk2/ClueWeb09_French_1_sDocs',
    :web_nospam => '/mnt/disk1/ClueWeb09_English_1noSpam',
    :robust => '/mnt/disk5/Robust04/',
    :wt10g => '/mnt/disk3/WT10g_index',
    :gov2 => '/mnt/disk3/GOV2_index',
    :gigaword => '/local/data/GigaWord/index',
    :nyt => '/local/data/NYT_index',
    :wiki_en => '/local/data/WikiEn_index',
    :wiki_en2012 => '/local/data/WikiEn2012_index',
    :wiki_fr => '/local/data/WikiFr_index',
    :wiki_tc2012 => '/local/data/INEXQA2012index',
    :books => '/local/data/INEX/Books2011/indexedit',
    :ent => '/home/sanjuan/works/nist_eval/csiro_indri.ind'
  }

  IndexPathsCaracole = {
    :web_en => '/distant/index_clueweb/disk2/ClueWeb09_English_1_sDocs',
    :web_nospam => '/distant/index_clueweb/disk1/ClueWeb09_English_1noSpam',
    :robust => '/distant/data/Robust04',
    :wt10g => '/distant/index_clueweb/disk3/WT10g_index',
    :gov2 => '/distant/index_clueweb/disk3/GOV2_index'
  }

  ##
  # From the SIGKDD 2007 paper: "Exploiting underrepresented query aspects for automatic query expansion"
  def Context.query_aspects q
    query = Mirimiri::Document.new q

    2.upto(query.words.count) do |size|
      query.ngrams(size).each do |s|
        dp = Context.count_w Context::IndexPaths[:wiki_en2012],"#1(#{s})"
        d = Context.count_w Context::IndexPaths[:wiki_en2012],"#{s}",100000000
        p s

        denum = s.split.permutation.inject(0.0) do |res,p|
          tmp = (p == s.split) ? 0 : Context.count_w(Context::IndexPaths[:wiki_en2012],"#1(#{p.join(" ")})")
          res + tmp
        end

        existence = dp.to_f/d
        support = dp.to_f/denum
        puts "#{s} ===> #{existence*support}"
      end
    end
  end

  ##
  # From the CIKM 2007 paper: "Ranking Very Many Typed Entities on Wikipedia"
  #
  # The ``entities`` parameter is currently an array of strings. Could be moved
  # to an array of Entity objects.
  def Context.entity_web_ranking query,entities,nbdocs=100,index='web_nospam'
    q = Indri::IndriQuery.new({:query => "#combine ( #{query} )", :count => nbdocs},"-trecFormat=true")
    indri_index = Indri::IndriIndex.new IndexPaths[index.to_sym]
    docs = indri_index.runquery(q).force_encoding("ISO-8859-1").encode("UTF-8")
    query_list = docs.split("\n").collect { |p| p.scan(/\d+ Q0 (.+) \d+ -\d+.\d+ .+/).first }.flatten

    res = entities.pmap(15) do |e|
      eq = Indri::IndriQuery.new({:query => "#combine ( #{e.gsub(/[^a-zA-Z0-9\s]/,'')} )", :count => nbdocs},"-trecFormat=true")
      edocs = indri_index.runquery(eq).force_encoding("ISO-8859-1").encode("UTF-8")
      e_list = edocs.split("\n").collect { |p| p.scan(/\d+ Q0 (.+) \d+ -\d+.\d+ .+/).first }.flatten

      rels = e_list&query_list

      ave_p = 1.upto(nbdocs).inject(0.0) do |sum,k|
        p = (e_list.first(k)&rels).count.to_f/k
        rel = rels.include?(e_list[k-1]) ? 1.0 : 0.0
        sum + p*rel
      end

      {:name => e, :score => ave_p}
    end

    res.sort { |a,b| b[:score] <=> a[:score] }
  end

  def Context.query_entities query,nb_docs=10
    sources = ['wiki_en2012']
    # sources = ['wiki_en2012','web_nospam','nyt','gigaword']
    # sources = ['web_fr']
    sc = Hash.new { |h,k| h[k] = 0.0 }

    sources.each do |source|
      puts " == source : #{source}"
      c = ConceptModel.new query,source,nb_docs
      p c.query

      c.concepts.each do |concept|
        querys = concept.words[0,4].join " "

        d1 = Context::label_candidate querys.sequential_dependence_model,'wiki_en'
        d2 = Context::label_candidate querys.sequential_dependence_model,'wiki_en2012'
        d3 = Mirimiri::WikipediaPage.search_wikipedia_titles querys

        d1 = [] if d1.nil?
        d2 = [] if d2.nil?
        d3 = [] if d3.nil?

        d = d2 & d3
        labels = d.collect { |t| t.downcase.gsub(/[^\w\d]/,' ') }
        p d


        mins = -10000000
        lab = nil
        scores = labels.collect do |l|
          s = concept.score_label l
          if s > mins
            mins = s
            lab = l
          end
          sc[l] += s*(concept.coherence/c.total_coherence)
          { :label => l, :score => s }
        end

        print((concept.coherence/c.total_coherence).to_s+" <= ")
        p concept.elements.collect { |e| e.word }
      end
    end

    sc.sort { |a,b| b[1] <=> a[1] }
  end

  def Context.label_candidate query,index,nb_candidates=10,rm3=false
    # Mirimiri::WikipediaPage.search_wikipedia_titles query
    # nb_candidates defaults to 10 so the two-argument calls in query_entities work.
    args = rm3 ? "-fbDocs=20 -fbTerms=30 -fbMu=2000 -fbOrigWeight=0.7" : ""
    q = Indri::IndriQuery.new({:query => query, :count => nb_candidates},"-printDocuments=true -trecFormat=true #{args}")
    # Keep the index *name* in `index`; the IndriIndex object gets its own
    # variable so the encoding check below can still match the name.
    indri_index = Indri::IndriIndex.new IndexPaths[index.to_sym]
    docs = indri_index.runquery q
    docs = docs.force_encoding("ISO-8859-1").encode("UTF-8") if ['web_fr','web_en','web_nospam'].include? index
    idocs = Indri::IndriPrintedDocuments.new(docs)

    wiki_titles = idocs.extract_docs.collect do |d|
      t = Nokogiri::HTML d
      t.xpath('//title').text
    end

    wiki_titles
  end

  def Context.lcm query
    source = 'nyt'

    a = Time.now
    qc = QueryContext.new(1.upto(20).collect do |nb_docs|
      beg = Time.now
      c = ConceptModel.new query,source,nb_docs
      puts "#{nb_docs} ==> Time elapsed: #{Time.now-beg} seconds"
      c
    end)
    puts "All concepts : #{Time.now-a} seconds"

    model = qc.best_concept_model
    puts "Total : #{Time.now-a} seconds"
    model
  end

  def Context.term_context index_path,query,size,num_page,args={}
    terms = self.term_concepts index_path,query,size,num_page,args
    args[:window] ||= 1

    # context = "#weight ( #{terms.compact.sort{ |a,b| b[0] <=> a[0]}[0,size].collect { |e| "#{e[0]} #1(#{e[1]})" }.join " "} ) " unless terms.empty?
    context = "#weight ( #{terms.collect { |c| "#{"%.10f" % c[:score]} #uw#{args[:window]}(#{c[:concept]})" }.join " "} ) " unless terms.empty?

    context
  end

  # From the SIGIR'06 paper: `Improving the estimation of relevance models using large external corpora`
  #
  def Context.morm index_path,query,size,num_page
    docs,scores,names = self.feedback_docs index_path,query,num_page

    terms = []

    docs.each do |d|
      r = Mirimiri::Document.new d
      tmp = self.extract_ngrams r,:tf,1
      terms += tmp.compact.collect { |t| [t[0]*Math.exp(scores[docs.index(d)].to_f),t[1]] }
    end

    final = terms.compact.sort{ |a,b| b[0] <=> a[0]}[0,size].collect { |e| { :score => e[0], :concept => e[1] } }
    context = "#weight ( #{final.collect { |c| "#{"%.10f" % c[:score]} #1(#{c[:concept]})" }.join " "} ) " unless terms.empty?

    context
  end

  def Context.term_concepts index_path,query,size,num_page,args={}
    args[:func] ||= :entropy
    args[:window] ||= 1

    docs = self.feedback_docs index_path,query,num_page

    resource = Mirimiri::Document.new docs.join(' ')
    terms = self.extract_ngrams resource,args[:func].to_sym,args[:window]

    terms.compact.sort{ |a,b| b[0] <=> a[0]}[0,size].collect { |e| { :score => e[0], :concept => e[1] } }
  end


  def Context.sentence_similarity s1,s2,index_path
    q = s1.is_a?(String) ? s1.split : s1
    r = s2.is_a?(String) ? s2.split : s2

    inter = q & r

    s = (inter.count/q.count.to_f) * inter.inject(0.0) { |sum,w| sum + Math.log(Context.df index_path,w) }
    s
  end


  private

  def Context.df index_path,w,window=1
    if @@count[index_path]["total#{window}"].nil?
      total = `dumpindex #{index_path} s`.match(/total terms:\t(.*)/)[1].to_f-(window-1).to_f
      @@semaphore.synchronize {
        @@count[index_path]["total#{window}"] = total
      }
    end

    if @@df[index_path]["#uw#{window}(#{w})"].nil?
      nb = `dumpindex #{index_path} e "#uw#{window}(#{w})" | awk ' { arr[$1]=$0 } END { for ( key in arr ) { print arr[key] } } ' | wc -l`.chomp.split(':').last.to_f - 1
      @@semaphore.synchronize {
        @@df[index_path]["#uw#{window}(#{w})"] = nb+1.0
      }
    end
    begin
      d = @@count[index_path]["total#{window}"]/@@df[index_path]["#uw#{window}(#{w})"]
    rescue
      puts w
      exit
    end
    d
  end

  def Context.prob_w index_path,w,window=1
    if @@count[index_path]["total#{window}"].nil?
      total = `dumpindex #{index_path} s`.match(/total terms:\t(.*)/)[1].to_f-(window-1).to_f
      @@semaphore.synchronize {
        @@count[index_path]["total#{window}"] = total+1.0
      }
    end

    nb = self.count_w index_path,w,window
    nb/@@count[index_path]["total#{window}"]
  end

  def Context.count_w index_path,w,window=1
    if @@count[index_path]["##{window}(#{w})"].nil?
      nb = `dumpindex #{index_path} x "##{window}(#{w})"`.chomp.split(':').last.to_f
      @@semaphore.synchronize {
        @@count[index_path]["##{window}(#{w})"] = ("%.15f" % nb).to_f+1.0
      }
    end
    @@count[index_path]["##{window}(#{w})"]
  end


  public
  def Context.extract_ngrams resource,func,n
    raw_terms = 1.upto(n).collect { |i| resource.ngrams(i) }.flatten
    # raw_terms = resource.ngrams(n).flatten
    terms = raw_terms.uniq.collect { |w| [resource.send(func.to_sym,w), w.unaccent] unless w.is_stopword? || w.split.any? { |e| e.length <= 2 } || w.split.any? { |e| e !~ /[a-zA-Z]/ } || w.include?(".") || (Mirimiri::Stoplist&w.unaccent.split).count >= 2 }
    # terms = raw_terms.uniq.collect { |w| w=w.gsub(/\W/,' ').strip; [resource.send(func.to_sym,w), w.unaccent] unless w.is_stopword? || w.split.any? { |e| e.length <= 2 } || w.split.any? { |e| e !~ /[a-zA-Z]/ } || w.include?(".") || (Mirimiri::Stoplist&w.unaccent.split).count >= 1 }
    # terms = raw_terms.uniq.collect { |w| [resource.send(func.to_sym,w), w.unaccent] unless w.is_stopword? || w.split.all? { |e| e.length <= 1 } || w.split.all? { |e| e !~ /[a-zA-Z]/ } || w.include?(".") }
    terms
  end

  def Context.feedback_docs index_path,query,num_page
    query = Indri::IndriQuery.new({:query => query, :count => num_page},"-printDocuments=true -trecFormat=true")
    index = Indri::IndriIndex.new index_path
    idocs = Indri::IndriPrintedDocuments.new(index.runquery(query).force_encoding("ISO-8859-1").encode("UTF-8"))

    texts,scores,names = idocs.extract_docs_score

    docs = texts.collect do |idoc|
      begin
        Sanitize.clean idoc,:remove_contents => ['script','style']
      rescue
        d = Nokogiri::HTML(idoc)
        d.xpath('//text()').text
      end
    end

    return docs,scores,names
  end

end
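For orientation, Context.term_context above assembles an Indri #weight query over #uwN (unordered window) expressions. A purely illustrative sketch of its output, with made-up terms and weights and args[:window] = 2, would look like:

#weight ( 0.0231201483 #uw2(hubble telescope) 0.0175998211 #uw2(mirror) 0.0142087345 #uw2(nasa) )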
lib/context/concept.rb
File was created

#!/usr/bin/env ruby

class Concept
  attr_reader :elements, :coherence

  def initialize
    @elements = []
    @coherence = 0
  end

  def <<(elem)
    raise ArgumentError, 'Argument must be a ConceptualElement.' unless elem.is_a? ConceptualElement
    @elements << elem
  end

  def compute_coherence scores,theta,k #arg=nil
    # update_feedback_coherence arg
    @coherence = 0.upto(theta.count-1).inject(0.0) do |sum,i|
      sum + Math.exp(theta[i][k])*Math.exp(scores[i].to_f)
    end
  end

  def score_label label,index_path=Context::IndexPaths[:wiki_en2012]
    s = @elements.inject(0.0) do |res,e|
      # *self.prob_w(index_path,"#{w[:word]} #uw10(#{label})")
      res + e.prob*Math.log(Context.prob_w(index_path,"#{e.word} #uw10(#{label})")/(e.p_in_coll*Context.prob_w(index_path,label)))
    end

    s
  end

  def concept_words_similarity s,index_path=Context::IndexPaths[:wiki_en]
    # inter = @elements.collect { |w| w unless (w.word & s).empty? }
    inter = self.words & s.words

    sim = (inter.count/self.words.count.to_f) * inter.inject(0.0) { |sum,w| sum + Math.log(Context.df index_path,w) }
    sim
  end

  def weighted_concept_similarity s,index_path=Context::IndexPaths[:wiki_en]
    inter = self.words & s.words
    sim = (inter.count/self.words.count.to_f)
    sim *= @elements.inject(0.0) do |sum,w|
      wp = s.get_element_from_word w.word

      s.words.include?(w.word) ? sum + wp.prob*w.prob*Math.log(Context.df index_path,w.word) : sum + 0.0
    end
    sim
  end

  def get_element_from_word w
    @elements.select { |e| e.word == w }.first
  end

  def words
    @elements.collect { |w| w.word }
  end

  def word_probs
    res = {}
    @elements.each { |w| res[w.word] = w.prob }
    res
  end

  # From papers:
  # NAACL'10: `Automatic Evaluation of Topic Coherence`
  # EMNLP'12: `Exploring Topic Coherence over many models and many topics`
  #
  def uci_coherence epsilon=1,index_path=Context::IndexPaths[:wiki_en]
    coherence = @elements.combination(2).inject(0.0) do |res,bigram|
      #Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}",20)*
      t = (Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}",20)+epsilon)/((bigram.first.p_in_coll)*(bigram.last.p_in_coll))
      res + Math.log(t)
    end

    coherence /= @elements.count*(@elements.count-1)
    coherence
  end

  def uci_coherence_entropy index_path=Context::IndexPaths[:wiki_en]
    coherence = @elements.combination(2).inject(0.0) do |res,bigram|
      #Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}",20)*
      t = (Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}",20))/((bigram.first.p_in_coll)*(bigram.last.p_in_coll))
      res + t
    end

    coherence /= @elements.count*(@elements.count-1)
    coherence
  end

  protected
  def update_coherence index_path=Context::IndexPaths[:wiki_en]
    coherence = @elements.combination(2).inject(0.0) do |res,bigram|
      res + Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}")*Math.log(Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}")/((bigram.first.p_in_coll index_path)*(bigram.last.p_in_coll index_path)))
    end

    coherence /= @elements.count*(@elements.count-1)
    @coherence = coherence
  end

  def update_feedback_coherence documents
    corpus = Mirimiri::Document.new documents.join " "

    windows = corpus.ngrams(10).collect { |w| w.split }

    coherence = @elements.combination(2).inject(0.0) do |res,bigram|
      big_prob = windows.count{ |c| c.include?(bigram.first.word) && c.include?(bigram.last.word) }.to_f/windows.count
      mi = big_prob.zero? ? 0.0 : big_prob*bigram.first.prob*bigram.last.prob*Math.log(big_prob/(corpus.tf(bigram.first.word)*corpus.tf(bigram.last.word)))
      res + mi
    end

    coherence /= @elements.count*(@elements.count-1)
    @coherence = coherence
  end

end
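For reference, uci_coherence above is the PMI-based topic coherence from the NAACL'10 and EMNLP'12 papers cited in its comment. As implemented, with P(w_i, w_j) estimated by Context.prob_w over a 20-term unordered window and P(w) by ConceptualElement#p_in_coll, the score of a concept with N elements is

\[ C_{\mathrm{UCI}} = \frac{1}{N(N-1)} \sum_{i<j} \log \frac{P(w_i,w_j) + \epsilon}{P(w_i)\,P(w_j)} \]

The code normalizes by N(N-1) rather than the N(N-1)/2 unordered pairs, which only rescales the score.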
lib/context/concept_model.rb
File was created

#!/usr/bin/env ruby

require 'lda-ruby'
require 'peach'

class ConceptModel
  attr_reader :concepts,:documents,:source,:nbdocs,:nbterms,:query,:total_coherence,:doc_scores,:doc_names,:theta,:entropy_coherence,:avg_coherence

  def ConceptModel.parse_hdp str
    concepts = []
    eval(str).each do |hdp_top|
      c = Concept.new
      hdp_top.gsub(/topic \d: /,'').split(" + ").each do |words|
        ee = words.split('*')
        begin
          e = ConceptualElement.new ee[1],ee[0].to_f
          c << e
        rescue ArgumentError
          next
        end
      end

      concepts << c
    end
    concepts
  end

  def initialize query,source,nb_docs,nb_terms=10,k=false
    raise ArgumentError, 'Argument 1 must be a String.' unless query.is_a? String
    raise ArgumentError, 'Argument 2 must be a valid Index key.' unless Context::IndexPaths.has_key?(source.to_sym)

    @source = source.to_sym
    @nbdocs = nb_docs
    @nbterms = nb_terms
    @query = query
    @concepts = []
    @total_coherence = 0.0

    corpus = Lda::Corpus.new

    @documents,@doc_scores,@doc_names = Context.feedback_docs Context::IndexPaths[@source],@query,@nbdocs
    @documents.each do |d|
      doc = Lda::TextDocument.new corpus,d
      corpus.add_document doc
    end

    if k == false
      num_topics = topic_divergence corpus
    else
      num_topics = k
    end

    lda = Lda::Lda.new corpus
    lda.verbose = false
    lda.num_topics = num_topics

    lda.em('random')

    @beta = lda.beta # to avoid repeated expensive computation
    @vocab = lda.vocab #

    @theta = lda.compute_topic_document_probability

    # Normalizing the phi_t(w) weights for each topic
    #
    total_prob = {}
    tmp_top_word_indices(@nbterms,@vocab,@beta).each_pair do |topic,indices|
      total_prob[topic] = indices.inject(0.0) { |res,i| res + Math.exp(@beta[topic][i].to_f) }
    end

    tmp_top_word_indices(@nbterms,@vocab,@beta).each_pair do |topic,indices|
      c = Concept.new
      indices.each do |i|
        begin
          e = ConceptualElement.new @vocab[i],(Math.exp(@beta[topic][i].to_f)/total_prob[topic])
          c << e
        rescue ArgumentError
          next
        end
      end

      c.compute_coherence @doc_scores,@theta,topic

      # c.compute_coherence @doc_scores,gamma_m,topic # takes time since it has to compute several probabilities
      @concepts << c
      @total_coherence += c.coherence
    end
  end

  def to_s
    @concepts.collect do |c|
      "#{c.coherence/@total_coherence} => [#{c.elements.collect do |e|
        "#{e.prob} #{e.word}"
      end.join(', ')
      }]"
    end.join "\n"
  end

  def to_indriq
    "#weight( #{@concepts.collect do |c|
      "#{c.coherence/@total_coherence} #weight ( #{c.elements.collect do |e|
        "#{e.prob} #{e.word}"
      end.join(' ')
      } ) "
    end.join " "} )"
  end

  def <<(concept)
    raise ArgumentError, 'Argument must be a Concept.' unless concept.is_a? Concept
    @concepts << concept
  end

  def avg_model_coherence
    if @documents.empty?
      @avg_coherence = 0.0
    else
      @avg_coherence = @concepts.inject(0.0) { |res,c| res + c.uci_coherence }/@concepts.count #if @avg_coherence.nil?
    end
    @avg_coherence
  end

  def entropy_model_coherence
    if @documents.empty?
      @entropy_coherence = 0.0
    else
      @entropy_coherence = @concepts.inject(0.0) do |res,c|
        ent = c.uci_coherence_entropy
        ent += 0.0000000000000000000000001 if ent.zero?
        res + ent*Math.log(ent)
      end #if @entropy_coherence.nil?
    end
    @entropy_coherence
  end

  private
  def topic_divergence corpus
    max_kl = 0.0
    # Old trick to limit number of iterations
    # num_p = @nbdocs < 6 ? @nbdocs + 5 : @nbdocs

    semaphore = Mutex.new

    1.upto(20).inject do |k,ntop|
    # 1.upto(num_p).inject do |k,ntop|
      lda = Lda::Lda.new corpus
      lda.verbose = false
      lda.num_topics = ntop
      lda.em('random')
      beta_m = lda.beta # to avoid repeated expensive computation
      vocab = lda.vocab

      topics_i = Array.new(ntop) { |i| i }

      sum_kl = topics_i.combination(2).inject(0.0) do |kl,topics|
        ti = topics.first
        tj = topics.last
        begin
          kl + 0.upto(vocab.count-1).inject(0.0) do |res,w_i|
            res + ( Math.exp(beta_m[ti][w_i])*Math.log(Math.exp(beta_m[ti][w_i])/Math.exp(beta_m[tj][w_i])) + Math.exp(beta_m[tj][w_i])*Math.log(Math.exp(beta_m[tj][w_i])/Math.exp(beta_m[ti][w_i])) )
          end
        rescue
          kl + 0.0
        end
      end

      sum_kl /= ntop*(ntop-1)
      sum_kl = max_kl if sum_kl.nan? || sum_kl.infinite?

      sum_kl <= max_kl ? k : (max_kl = sum_kl and ntop)
    end
  end

  def tmp_top_word_indices(words_per_topic = 10,vocab,beta)
    raise 'No vocabulary loaded.' unless vocab

    # find the highest scoring words per topic
    topics = Hash.new
    indices = (0...vocab.size).to_a

    beta.each_with_index do |topic, topic_num|
      topics[topic_num] = (topic.zip((0...vocab.size).to_a).sort { |i, j| i[0] <=> j[0] }.map { |i, j| j }.reverse)[0...words_per_topic]
    end

    topics
  end

end
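topic_divergence above selects the number of LDA topics by maximizing the average symmetric Kullback-Leibler divergence between topic-word distributions, with phi_i(w) = exp(beta[i][w]) taken from lda-ruby. Written out, the quantity maximized over the candidate topic counts K (up to 20 here) is

\[ D(K) = \frac{1}{K(K-1)} \sum_{i \ne j} \sum_{w} \phi_i(w) \log \frac{\phi_i(w)}{\phi_j(w)} \]

which is the sum of the two directed KL divergences over every unordered topic pair, normalized by the number of ordered pairs.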
lib/context/conceptual_element.rb
File was created

#!/usr/bin/env ruby

class ConceptualElement
  attr_reader :word, :prob

  def initialize w,s
    raise ArgumentError, 'Argument 1 must be a String.' unless w.is_a? String
    raise ArgumentError, 'Argument 2 must be a Float.' unless s.is_a? Float

    tmp = w.gsub(/(-)\1+/,'-').gsub(/([^\w-].*|^-|-$)/,'')
    raise ArgumentError, 'Argument 1 is not a useful word! ;)' if tmp.is_stopword? || tmp.size < 2

    @word = tmp
    @prob = s
  end

  def p_in_coll index_path=Context::IndexPaths[:wiki_en],size=20
    Context.prob_w index_path,@word,size
  end
end
lib/context/query_context.rb
File was created

#!/usr/bin/env ruby

class QueryContext < Array

  def best_concept_model
    max_sim = 0.0
    best = nil

    for p in 0...self.count
      sim = 0.0
      for pp in 0...self.count
        next if pp == p
        combs = self.at(p).concepts.product self.at(pp).concepts
        sum_sim = combs.inject(0.0) { |sum,k| sum + k.first.weighted_concept_similarity(k.last) }
        sim += sum_sim/combs.count
      end

      if sim > max_sim
        max_sim = sim
        best = p
      end
    end

    best.nil? ? nil : self.at(best)
  end

  def best_concept_model_word
    max_sim = 0.0
    best = nil

    for p in 0...self.count
      sim = 0.0
      for pp in 0...self.count
        next if pp == p
        combs = self.at(p).concepts.product self.at(pp).concepts
        sum_sim = combs.inject(0.0) { |sum,k| sum + k.first.concept_words_similarity(k.last) }
        sim += sum_sim/combs.count
      end

      if sim > max_sim
        max_sim = sim
        best = p
      end
    end

    best.nil? ? nil : self.at(best)
  end
end
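Tying the pieces together: Context.lcm in lib/context.rb builds one ConceptModel per feedback-set size (1 to 20 documents over the nyt index), wraps them all in a QueryContext, and keeps whichever model best_concept_model judges most similar to all the others. A hypothetical call (the query string is made up, and the hard-coded index paths must exist for it to run):

model = Context.lcm 'international art crime'
puts model.to_indriq unless model.nil?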