Commit 65040e3e69f49124aec05e30451769bb1c5f9d01

Authored by Romain Deveaud
1 parent e55598814a
Exists in master

Changes made for the OAIR paper

Showing 7 changed files with 681 additions and 63 deletions

# context

Copyright (C) 2012 Romain Deveaud <romain.deveaud@gmail.com>

Stand-alone functions built on top of mirimiri and lda-ruby, aiming to extract contextual features from general corpora related to a search scenario.

This code was used to produce the results reported in our OAIR'13 paper: "Unsupervised Latent Concept Modeling to Identify Query Facets".

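A minimal usage sketch ('french revolution' is an arbitrary example query; the hard-coded index paths in lib/context.rb must point at real local Indri indexes, and the library must be on the load path, for any of this to run):

    require 'context'

    # Indri #weight query built from the strongest term concepts found in
    # the top 20 feedback documents of the :wiki_en index.
    puts Context.term_context(Context::IndexPaths[:wiki_en], 'french revolution', 10, 20)

    # Latent concept modeling: estimate concept models over 1..20 feedback
    # documents and keep the most coherent one (cf. the OAIR'13 paper).
    puts Context.lcm('french revolution')
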
License
=======

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.

File was deleted

#!/usr/bin/env ruby

require 'mirimiri'
require 'sanitize'
require 'lda-ruby'

module Context
  IndexPaths = {
    :web_en     => '/mnt/disk2/ClueWeb09_English_1_sDocs',
    :web_fr     => '/mnt/disk2/ClueWeb09_French_1_sDocs',
    :web_nospam => '/mnt/disk1/ClueWeb09_English_1noSpam',
    :gigaword   => '/local/data/GigaWord/index',
    :nyt        => '/local/data/NYT_index',
    :wiki_en    => '/local/data/WikiEn_index',
    :wiki_fr    => '/local/data/WikiFr_index'
  }

  def Context.term_context index_path,query,size,num_page,args={}
    args[:func]   ||= :entropy
    args[:window] ||= 1

    docs = self.feedback_docs index_path,query,num_page

    resource = Mirimiri::Document.new docs.join(' ')
    terms    = self.extract_ngrams resource,args[:func].to_sym,args[:window]

    context = "#weight ( #{terms.compact.sort { |a,b| b[0] <=> a[0] }[0,size].collect { |e| "#{e[0]} #1(#{e[1]})" }.join " "} ) " unless terms.empty?

    context
  end

  def Context.topic_context index_path,query,size,num_page,args={}
    corpus = Lda::Corpus.new

    docs = self.feedback_docs index_path,query,num_page
    docs.each do |d|
      doc = Lda::TextDocument.new corpus,d
      corpus.add_document doc
    end

    lda = Lda::Lda.new corpus
    lda.num_topics = num_page/10
    lda.em 'random'
    puts lda.top_words(size)
  end

  private
  def Context.feedback_docs index_path,query,num_page
    query = Indri::IndriQuery.new({:query => query, :count => num_page},"-printDocuments=true -trecFormat=true")
    index = Indri::IndriIndex.new index_path
    idocs = Indri::IndriPrintedDocuments.new(index.runquery(query).force_encoding("ISO-8859-1").encode("UTF-8"))

    docs = idocs.extract_docs.collect { |idoc| Sanitize.clean idoc,:remove_contents => ['script'] }
    docs
  end

  def Context.extract_ngrams resource,func,n
    raw_terms = 1.upto(n).collect { |i| resource.ngrams(i) }.flatten
    terms = raw_terms.uniq.collect { |w| [resource.send(func.to_sym,w), w.unaccent] unless w.is_stopword? || w.split.all? { |e| e.length <= 1 } || w.split.all? { |e| e !~ /[a-zA-Z]/ } || w.include?(".") }
    terms
  end

end
File was created

#!/usr/bin/env ruby

require 'mirimiri'
require 'sanitize'
require 'lda-ruby'
require 'context/conceptual_element'
require 'context/concept_model'
require 'context/concept'
require 'context/query_context'


module Context
  @@count     = Hash.new { |h,k| h[k] = {} }
  @@df        = Hash.new { |h,k| h[k] = {} }
  @@semaphore = Mutex.new

  IndexPaths = {
    :web_en      => '/mnt/disk2/ClueWeb09_English_1_sDocs',
    :web_fr      => '/mnt/disk2/ClueWeb09_French_1_sDocs',
    :web_nospam  => '/mnt/disk1/ClueWeb09_English_1noSpam',
    :robust      => '/mnt/disk5/Robust04/',
    :wt10g       => '/mnt/disk3/WT10g_index',
    :gov2        => '/mnt/disk3/GOV2_index',
    :gigaword    => '/local/data/GigaWord/index',
    :nyt         => '/local/data/NYT_index',
    :wiki_en     => '/local/data/WikiEn_index',
    :wiki_en2012 => '/local/data/WikiEn2012_index',
    :wiki_fr     => '/local/data/WikiFr_index',
    :wiki_tc2012 => '/local/data/INEXQA2012index',
    :books       => '/local/data/INEX/Books2011/indexedit',
    :ent         => '/home/sanjuan/works/nist_eval/csiro_indri.ind'
  }

  IndexPathsCaracole = {
    :web_en     => '/distant/index_clueweb/disk2/ClueWeb09_English_1_sDocs',
    :web_nospam => '/distant/index_clueweb/disk1/ClueWeb09_English_1noSpam',
    :robust     => '/distant/data/Robust04',
    :wt10g      => '/distant/index_clueweb/disk3/WT10g_index',
    :gov2       => '/distant/index_clueweb/disk3/GOV2_index'
  }

  # #
  # From the SIGKDD 2007 paper: "Exploiting underrepresented query aspects for automatic query expansion"
  def Context.query_aspects q
    query = Mirimiri::Document.new q

    2.upto(query.words.count) do |size|
      query.ngrams(size).each do |s|
        dp = Context.count_w Context::IndexPaths[:wiki_en2012],"#1(#{s})"
        d  = Context.count_w Context::IndexPaths[:wiki_en2012],"#{s}",100000000
        p s

        denum = s.split.permutation.inject(0.0) do |res,p|
          tmp = (p == s.split) ? 0 : Context.count_w(Context::IndexPaths[:wiki_en2012],"#1(#{p.join(" ")})")
          res + tmp
        end

        existence = dp.to_f/d
        support   = dp.to_f/denum
        puts "#{s} ===> #{existence*support}"
      end
    end
  end
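
  # The score printed above, written out (notation reconstructed from the
  # code, not quoted from the SIGKDD'07 paper): with n(s) the exact-phrase
  # count #1(s), n_w(s) the count of the words of s within a very large
  # ordered window (in practice, co-occurrence anywhere in a document),
  # and perm(s) the non-identity permutations of s,
  #
  #   existence(s) = n(s) / n_w(s)
  #   support(s)   = n(s) / sum_{p in perm(s)} n(p)
  #   score(s)     = existence(s) * support(s)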

  # #
  # From the CIKM 2007 paper: "Ranking Very Many Typed Entities on Wikipedia"
  #
  # The ``entities`` parameter is currently an array of strings. Could be moved
  # to an array of Entity objects.
  def Context.entity_web_ranking query,entities,nbdocs=100,index='web_nospam'
    q = Indri::IndriQuery.new({:query => "#combine ( #{query} )", :count => nbdocs},"-trecFormat=true")
    indri_index = Indri::IndriIndex.new IndexPaths[index.to_sym]
    docs = indri_index.runquery(q).force_encoding("ISO-8859-1").encode("UTF-8")
    query_list = docs.split("\n").collect { |p| p.scan(/\d+ Q0 (.+) \d+ -\d+\.\d+ .+/).first }.flatten

    res = entities.pmap(15) do |e|
      eq = Indri::IndriQuery.new({:query => "#combine ( #{e.gsub(/[^a-zA-Z0-9\s]/,'')} )", :count => nbdocs},"-trecFormat=true")
      edocs = indri_index.runquery(eq).force_encoding("ISO-8859-1").encode("UTF-8")
      e_list = edocs.split("\n").collect { |p| p.scan(/\d+ Q0 (.+) \d+ -\d+\.\d+ .+/).first }.flatten

      rels = e_list & query_list

      ave_p = 1.upto(nbdocs).inject(0.0) do |sum,k|
        p = (e_list.first(k) & rels).count.to_f/k
        rel = rels.include?(e_list[k-1]) ? 1.0 : 0.0
        sum + p*rel
      end

      {:name => e, :score => ave_p}
    end

    res.sort { |a,b| b[:score] <=> a[:score] }
  end
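
  # The ave_p accumulator above is (unnormalised) average precision:
  # precision at rank k, summed only at the ranks holding a relevant
  # document. A toy run of the same loop on fabricated document IDs:
  #
  #   e_list = %w[d1 d2 d3 d4]   # hypothetical entity ranking
  #   rels   = %w[d1 d3]         # its overlap with the query ranking
  #   ave_p  = 1.upto(e_list.count).inject(0.0) do |sum,k|
  #     p = (e_list.first(k) & rels).count.to_f/k
  #     sum + p*(rels.include?(e_list[k-1]) ? 1.0 : 0.0)
  #   end
  #   ave_p # => 1/1 at rank 1 plus 2/3 at rank 3 = 1.666...;
  #         #    classic AP would further divide by rels.count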

  def Context.query_entities query,nb_docs=10
    sources = ['wiki_en2012']
    # sources = ['wiki_en2012','web_nospam','nyt','gigaword']
    # sources = ['web_fr']
    sc = Hash.new { |h,k| h[k] = 0.0 }

    sources.each do |source|
      puts " == source : #{source}"
      c = ConceptModel.new query,source,nb_docs
      p c.query

      c.concepts.each do |concept|
        querys = concept.words[0,4].join " "

        d1 = Context::label_candidate querys.sequential_dependence_model,'wiki_en'
        d2 = Context::label_candidate querys.sequential_dependence_model,'wiki_en2012'
        d3 = Mirimiri::WikipediaPage.search_wikipedia_titles querys

        d1 = [] if d1.nil?
        d2 = [] if d2.nil?
        d3 = [] if d3.nil?

        d = d2 & d3
        labels = d.collect { |c| c.downcase.gsub(/[^\w\d]/,' ') }
        p d

        mins = -10000000
        lab = nil
        scores = labels.collect do |l|
          s = concept.score_label l
          if s > mins
            mins = s
            lab = l
          end
          sc[l] += s*(concept.coherence/c.total_coherence)
          { :label => l, :score => s }
        end

        print "#{concept.coherence/c.total_coherence} <= "
        p concept.elements.collect { |c| c.word }
      end
    end

    sc.sort { |a,b| b[1] <=> a[1] }
  end

  def Context.label_candidate query,index,nb_candidates=10,rm3=false
    # Mirimiri::WikipediaPage.search_wikipedia_titles query
    args = rm3 ? "-fbDocs=20 -fbTerms=30 -fbMu=2000 -fbOrigWeight=0.7" : ""
    q = Indri::IndriQuery.new({:query => query, :count => nb_candidates},"-printDocuments=true -trecFormat=true #{args}")
    indri_index = Indri::IndriIndex.new IndexPaths[index.to_sym]
    docs = indri_index.runquery q
    # `index` is the String key of the index, not the IndriIndex object.
    docs = docs.force_encoding("ISO-8859-1").encode("UTF-8") if ['web_fr','web_en','web_nospam'].include? index
    idocs = Indri::IndriPrintedDocuments.new(docs)

    wiki_titles = idocs.extract_docs.collect do |d|
      t = Nokogiri::HTML d
      t.xpath('//title').text
    end

    wiki_titles
  end

  def Context.lcm query
    source = 'nyt'

    a = Time.now
    qc = QueryContext.new(1.upto(20).collect do |nb_docs|
      beg = Time.now
      c = ConceptModel.new query,source,nb_docs
      puts "#{nb_docs} ==> Time elapsed: #{Time.now-beg} seconds"
      c
    end)
    puts "All concepts : #{Time.now-a} seconds"

    model = qc.best_concept_model
    puts "Total : #{Time.now-a} seconds"
    model
  end

  def Context.term_context index_path,query,size,num_page,args={}
    args[:window] ||= 1
    terms = self.term_concepts index_path,query,size,num_page,args

    # context = "#weight ( #{terms.compact.sort{ |a,b| b[0] <=> a[0]}[0,size].collect { |e| "#{e[0]} #1(#{e[1]})" }.join " "} ) " unless terms.empty?
    context = "#weight ( #{terms.collect { |c| "#{"%.10f" % c[:score]} #uw#{args[:window]}(#{c[:concept]})" }.join " "} ) " unless terms.empty?

    context
  end

  # From the SIGIR'06 paper: `Improving the estimation of relevance models using large external corpora`
  #
  def Context.morm index_path,query,size,num_page
    docs,scores,names = self.feedback_docs index_path,query,num_page

    terms = []

    docs.each_with_index do |d,i|
      r = Mirimiri::Document.new d
      tmp = self.extract_ngrams r,:tf,1
      terms += tmp.compact.collect { |t| [t[0]*Math.exp(scores[i].to_f),t[1]] }
    end

    final = terms.compact.sort { |a,b| b[0] <=> a[0] }[0,size].collect { |e| { :score => e[0], :concept => e[1] } }
    context = "#weight ( #{final.collect { |c| "#{"%.10f" % c[:score]} #1(#{c[:concept]})" }.join " "} ) " unless terms.empty?

    context
  end
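
  # What morm computes, written out: each unigram t occurring in a feedback
  # document D enters the pool with weight tf(t,D) * exp(score(D)). Assuming
  # score(D) is Indri's log query likelihood (as feedback_docs suggests),
  # this amounts to
  #
  #   w(t,D) = tf(t,D) * P(Q|D)
  #
  # i.e. a relevance-model-style weight over the external feedback set;
  # entries from different documents are ranked individually, not summed.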

  def Context.term_concepts index_path,query,size,num_page,args={}
    args[:func]   ||= :entropy
    args[:window] ||= 1

    # feedback_docs returns [texts, scores, names]; only the texts are used here.
    docs,scores,names = self.feedback_docs index_path,query,num_page

    resource = Mirimiri::Document.new docs.join(' ')
    terms    = self.extract_ngrams resource,args[:func].to_sym,args[:window]

    terms.compact.sort { |a,b| b[0] <=> a[0] }[0,size].collect { |e| { :score => e[0], :concept => e[1] } }
  end

  def Context.sentence_similarity s1,s2,index_path
    q = s1.is_a?(String) ? s1.split : s1
    r = s2.is_a?(String) ? s2.split : s2

    inter = q & r

    s = (inter.count/q.count.to_f) * inter.inject(0.0) { |sum,w| sum + Math.log(Context.df(index_path,w)) }
    s
  end
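
  # In symbols, with T the total term count of the index and df(w) the
  # number of windows containing w (Context.df below returns T/df(w)):
  #
  #   sim(q,r) = (|q & r| / |q|) * sum_{w in q & r} log( T / df(w) )
  #
  # Overlapping rare words therefore contribute more than common ones.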

  private

  def Context.df index_path,w,window=1
    if @@count[index_path]["total#{window}"].nil?
      # Total number of term positions in the index, adjusted for window size.
      total = `dumpindex #{index_path} s`.match(/total terms:\t(.*)/)[1].to_f-(window-1).to_f
      @@semaphore.synchronize {
        @@count[index_path]["total#{window}"] = total
      }
    end

    if @@df[index_path]["#uw#{window}(#{w})"].nil?
      # Roughly, the number of distinct documents matching w in an unordered
      # window (the awk stage deduplicates dumpindex's per-occurrence lines).
      nb = `dumpindex #{index_path} e "#uw#{window}(#{w})" | awk ' { arr[$1]=$0 } END { for ( key in arr ) { print arr[key] } } ' | wc -l`.chomp.split(':').last.to_f - 1
      @@semaphore.synchronize {
        @@df[index_path]["#uw#{window}(#{w})"] = nb+1.0
      }
    end
    begin
      d = @@count[index_path]["total#{window}"]/@@df[index_path]["#uw#{window}(#{w})"]
    rescue
      puts w
      exit
    end
    d
  end

  def Context.prob_w index_path,w,window=1
    if @@count[index_path]["total#{window}"].nil?
      total = `dumpindex #{index_path} s`.match(/total terms:\t(.*)/)[1].to_f-(window-1).to_f
      @@semaphore.synchronize {
        @@count[index_path]["total#{window}"] = total+1.0
      }
    end

    nb = self.count_w index_path,w,window
    nb/@@count[index_path]["total#{window}"]
  end

  def Context.count_w index_path,w,window=1
    if @@count[index_path]["##{window}(#{w})"].nil?
      nb = `dumpindex #{index_path} x "##{window}(#{w})"`.chomp.split(':').last.to_f
      @@semaphore.synchronize {
        @@count[index_path]["##{window}(#{w})"] = ("%.15f" % nb).to_f+1.0
      }
    end
    @@count[index_path]["##{window}(#{w})"]
  end

  public
  def Context.extract_ngrams resource,func,n
    raw_terms = 1.upto(n).collect { |i| resource.ngrams(i) }.flatten
    # raw_terms = resource.ngrams(n).flatten
    terms = raw_terms.uniq.collect { |w| [resource.send(func.to_sym,w), w.unaccent] unless w.is_stopword? || w.split.any? { |e| e.length <= 2 } || w.split.any? { |e| e !~ /[a-zA-Z]/ } || w.include?(".") || (Mirimiri::Stoplist&w.unaccent.split).count >= 2 }
    # terms = raw_terms.uniq.collect { |w| w=w.gsub(/\W/,' ').strip; [resource.send(func.to_sym,w), w.unaccent] unless w.is_stopword? || w.split.any? { |e| e.length <= 2 } || w.split.any? { |e| e !~ /[a-zA-Z]/ } || w.include?(".") || (Mirimiri::Stoplist&w.unaccent.split).count >= 1 }
    # terms = raw_terms.uniq.collect { |w| [resource.send(func.to_sym,w), w.unaccent] unless w.is_stopword? || w.split.all? { |e| e.length <= 1 } || w.split.all? { |e| e !~ /[a-zA-Z]/ } || w.include?(".") }
    terms
  end

  def Context.feedback_docs index_path,query,num_page
    query = Indri::IndriQuery.new({:query => query, :count => num_page},"-printDocuments=true -trecFormat=true")
    index = Indri::IndriIndex.new index_path
    idocs = Indri::IndriPrintedDocuments.new(index.runquery(query).force_encoding("ISO-8859-1").encode("UTF-8"))

    texts,scores,names = idocs.extract_docs_score

    docs = texts.collect do |idoc|
      begin
        Sanitize.clean idoc,:remove_contents => ['script','style']
      rescue
        d = Nokogiri::HTML(idoc)
        d.xpath('//text()').text
      end
    end

    return docs,scores,names
  end

end

lib/context/concept.rb
File was created

#!/usr/bin/env ruby

class Concept
  attr_reader :elements, :coherence

  def initialize
    @elements = []
    @coherence = 0
  end

  def <<(elem)
    raise ArgumentError, 'Argument must be a ConceptualElement.' unless elem.is_a? ConceptualElement
    @elements << elem
  end

  def compute_coherence scores,theta,k # arg=nil
    # update_feedback_coherence arg
    @coherence = 0.upto(theta.count-1).inject(0.0) do |sum,i|
      sum + Math.exp(theta[i][k])*Math.exp(scores[i].to_f)
    end
  end

  def score_label label,index_path=Context::IndexPaths[:wiki_en2012]
    s = @elements.inject(0.0) do |res,e|
      # *self.prob_w(index_path,"#{w[:word]} #uw10(#{label})")
      res + e.prob*Math.log(Context.prob_w(index_path,"#{e.word} #uw10(#{label})")/(e.p_in_coll*Context.prob_w(index_path,label)))
    end

    s
  end

  def concept_words_similarity s,index_path=Context::IndexPaths[:wiki_en]
    # inter = @elements.collect { |w| w unless (w.word & s).empty? }
    inter = self.words & s.words

    sim = (inter.count/self.words.count.to_f) * inter.inject(0.0) { |sum,w| sum + Math.log(Context.df(index_path,w)) }
    sim
  end

  def weighted_concept_similarity s,index_path=Context::IndexPaths[:wiki_en]
    inter = self.words & s.words
    sim = (inter.count/self.words.count.to_f)
    sim *= @elements.inject(0.0) do |sum,w|
      wp = s.get_element_from_word w.word

      s.words.include?(w.word) ? sum + wp.prob*w.prob*Math.log(Context.df(index_path,w.word)) : sum + 0.0
    end
    sim
  end

  def get_element_from_word w
    @elements.select { |e| e.word == w }.first
  end

  def words
    @elements.collect { |w| w.word }
  end

  def word_probs
    res = {}
    @elements.each { |w| res[w.word] = w.prob }
    res
  end

  # From papers:
  # NAACL'10: `Automatic Evaluation of Topic Coherence`
  # EMNLP'12: `Exploring Topic Coherence over many models and many topics`
  #
  def uci_coherence epsilon=1,index_path=Context::IndexPaths[:wiki_en]
    coherence = @elements.combination(2).inject(0.0) do |res,bigram|
      # Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}",20)*
      t = (Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}",20)+epsilon)/((bigram.first.p_in_coll)*(bigram.last.p_in_coll))
      res + Math.log(t)
    end

    coherence /= @elements.count*(@elements.count-1)
    coherence
  end
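
  # The quantity above is the UCI coherence of the NAACL'10/EMNLP'12 papers:
  #
  #   UCI = (1 / N(N-1)) * sum_{i<j} log( (P(w_i,w_j) + eps) / (P(w_i) P(w_j)) )
  #
  # with N = @elements.count and P estimated from 20-word windows. Note the
  # division by N(N-1) rather than the number of pairs N(N-1)/2; this is a
  # constant factor, so rankings of concepts by coherence are unaffected.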

  def uci_coherence_entropy index_path=Context::IndexPaths[:wiki_en]
    coherence = @elements.combination(2).inject(0.0) do |res,bigram|
      # Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}",20)*
      t = (Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}",20))/((bigram.first.p_in_coll)*(bigram.last.p_in_coll))
      res + t
    end

    coherence /= @elements.count*(@elements.count-1)
    coherence
  end

  protected
  def update_coherence index_path=Context::IndexPaths[:wiki_en]
    coherence = @elements.combination(2).inject(0.0) do |res,bigram|
      res + Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}")*Math.log(Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}")/((bigram.first.p_in_coll index_path)*(bigram.last.p_in_coll index_path)))
    end

    coherence /= @elements.count*(@elements.count-1)
    @coherence = coherence
  end

  def update_feedback_coherence documents
    corpus = Mirimiri::Document.new documents.join " "

    windows = corpus.ngrams(10).collect { |w| w.split }

    coherence = @elements.combination(2).inject(0.0) do |res,bigram|
      big_prob = windows.count { |c| c.include?(bigram.first.word) && c.include?(bigram.last.word) }.to_f/windows.count
      mi = big_prob.zero? ? 0.0 : big_prob*bigram.first.prob*bigram.last.prob*Math.log(big_prob/(corpus.tf(bigram.first.word)*corpus.tf(bigram.last.word)))
      res + mi
    end

    coherence /= @elements.count*(@elements.count-1)
    @coherence = coherence
  end

end

lib/context/concept_model.rb
File was created

#!/usr/bin/env ruby

require 'lda-ruby'
require 'peach'

class ConceptModel
  attr_reader :concepts,:documents,:source,:nbdocs,:nbterms,:query,:total_coherence,:doc_scores,:doc_names,:theta,:entropy_coherence,:avg_coherence

  # Parses topic strings of the form "topic 0: 0.03*word + ..." into
  # Concept objects. `str` must contain a Ruby array literal of such
  # strings, since it is passed straight to eval.
  def ConceptModel.parse_hdp str
    concepts = []
    eval(str).each do |hdp_top|
      c = Concept.new
      hdp_top.gsub(/topic \d: /,'').split(" + ").each do |words|
        ee = words.split('*')
        begin
          e = ConceptualElement.new ee[1],ee[0].to_f
          c << e
        rescue ArgumentError
          next
        end
      end

      concepts << c
    end
    concepts
  end

  def initialize query,source,nb_docs,nb_terms=10,k=false
    raise ArgumentError, 'Argument 1 must be a String.' unless query.is_a? String
    raise ArgumentError, 'Argument 2 must be a valid Index key.' unless Context::IndexPaths.has_key?(source.to_sym)

    @source  = source.to_sym
    @nbdocs  = nb_docs
    @nbterms = nb_terms
    @query   = query
    @concepts = []
    @total_coherence = 0.0

    corpus = Lda::Corpus.new

    @documents,@doc_scores,@doc_names = Context.feedback_docs Context::IndexPaths[@source],@query,@nbdocs
    @documents.each do |d|
      doc = Lda::TextDocument.new corpus,d
      corpus.add_document doc
    end

    if k == false
      num_topics = topic_divergence corpus
    else
      num_topics = k
    end

    lda = Lda::Lda.new corpus
    lda.verbose = false
    lda.num_topics = num_topics

    lda.em('random')

    @beta  = lda.beta  # to avoid repeated expensive computation
    @vocab = lda.vocab #

    @theta = lda.compute_topic_document_probability

    # Normalizing the phi_t(w) weights for each topic
    #
    total_prob = {}
    tmp_top_word_indices(@nbterms,@vocab,@beta).each_pair do |topic,indices|
      total_prob[topic] = indices.inject(0.0) { |res,i| res + Math.exp(@beta[topic][i].to_f) }
    end

    tmp_top_word_indices(@nbterms,@vocab,@beta).each_pair do |topic,indices|
      c = Concept.new
      indices.each do |i|
        begin
          e = ConceptualElement.new @vocab[i],(Math.exp(@beta[topic][i].to_f)/total_prob[topic])
          c << e
        rescue ArgumentError
          next
        end
      end

      c.compute_coherence @doc_scores,@theta,topic

      # c.compute_coherence @doc_scores,gamma_m,topic # takes time since it has to compute several probabilities
      @concepts << c
      @total_coherence += c.coherence
    end
  end

  def to_s
    @concepts.collect do |c|
      "#{c.coherence/@total_coherence} => [#{c.elements.collect do |e|
        "#{e.prob} #{e.word}"
      end.join(', ')
      }]"
    end.join "\n"
  end

  def to_indriq
    "#weight( #{@concepts.collect do |c|
      "#{c.coherence/@total_coherence} #weight ( #{c.elements.collect do |e|
        "#{e.prob} #{e.word}"
      end.join(' ')
      } ) "
    end.join " "} )"
  end

  def <<(concept)
    raise ArgumentError, 'Argument must be a Concept.' unless concept.is_a? Concept
    @concepts << concept
  end

  def avg_model_coherence
    if @documents.empty?
      @avg_coherence = 0.0
    else
      @avg_coherence = @concepts.inject(0.0) { |res,c| res + c.uci_coherence }/@concepts.count # if @avg_coherence.nil?
    end
    @avg_coherence
  end

  def entropy_model_coherence
    if @documents.empty?
      @entropy_coherence = 0.0
    else
      @entropy_coherence = @concepts.inject(0.0) do |res,c|
        ent = c.uci_coherence_entropy
        ent += 0.0000000000000000000000001 if ent.zero?
        res + ent*Math.log(ent)
      end # if @entropy_coherence.nil?
    end
    @entropy_coherence
  end

  private
  def topic_divergence corpus
    max_kl = 0.0
    # Old trick to limit number of iterations
    # num_p = @nbdocs < 6 ? @nbdocs + 5 : @nbdocs

    semaphore = Mutex.new

    1.upto(20).inject do |k,ntop|
    # 1.upto(num_p).inject do |k,ntop|
      lda = Lda::Lda.new corpus
      lda.verbose = false
      lda.num_topics = ntop
      lda.em('random')
      beta_m = lda.beta # to avoid repeated expensive computation
      vocab = lda.vocab

      topics_i = Array.new(ntop) { |i| i }

      sum_kl = topics_i.combination(2).inject(0.0) do |kl,topics|
        ti = topics.first
        tj = topics.last
        begin
          kl + 0.upto(vocab.count-1).inject(0.0) do |res,w_i|
            res + ( Math.exp(beta_m[ti][w_i])*Math.log(Math.exp(beta_m[ti][w_i])/Math.exp(beta_m[tj][w_i])) + Math.exp(beta_m[tj][w_i])*Math.log(Math.exp(beta_m[tj][w_i])/Math.exp(beta_m[ti][w_i])) )
          end
        rescue
          kl + 0.0
        end
      end

      sum_kl /= ntop*(ntop-1)
      sum_kl = max_kl if sum_kl.nan? || sum_kl.infinite?

      sum_kl <= max_kl ? k : (max_kl = sum_kl and ntop)
    end
  end
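
  # topic_divergence selects the number of topics K by maximising the
  # average symmetric KL divergence between the word distributions
  # phi_i = exp(beta_i) of all topic pairs:
  #
  #   D(K) = (1 / K(K-1)) * sum_{i != j} sum_w phi_i(w) log( phi_i(w) / phi_j(w) )
  #
  # The K ranging over 2..20 with the largest D(K) is returned; the
  # intuition is that a good K yields maximally distinct topics.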

  def tmp_top_word_indices(words_per_topic = 10,vocab,beta)
    raise 'No vocabulary loaded.' unless vocab

    # find the highest scoring words per topic
    topics = Hash.new
    indices = (0...vocab.size).to_a

    beta.each_with_index do |topic, topic_num|
      topics[topic_num] = (topic.zip((0...vocab.size).to_a).sort { |i, j| i[0] <=> j[0] }.map { |i, j| j }.reverse)[0...words_per_topic]
    end

    topics
  end

end

lib/context/conceptual_element.rb
File was created

#!/usr/bin/env ruby

class ConceptualElement
  attr_reader :word, :prob

  def initialize w,s
    raise ArgumentError, 'Argument 1 must be a String.' unless w.is_a? String
    raise ArgumentError, 'Argument 2 must be a Float.' unless s.is_a? Float

    # Collapse runs of hyphens, then strip everything from the first
    # non-word character onwards as well as leading/trailing hyphens.
    tmp = w.gsub(/(-)\1+/,'-').gsub(/([^\w-].*|^-|-$)/,'')
    raise ArgumentError, 'Argument 1 is not a useful word! ;)' if tmp.is_stopword? || tmp.size < 2

    @word = tmp
    @prob = s
  end

  def p_in_coll index_path=Context::IndexPaths[:wiki_en],size=20
    Context.prob_w index_path,@word,size
  end
end

lib/context/query_context.rb
File was created

#!/usr/bin/env ruby

class QueryContext < Array

  def best_concept_model
    max_sim = 0.0
    best = nil

    for p in 0...self.count
      sim = 0.0
      for pp in 0...self.count
        next if pp == p
        combs = self.at(p).concepts.product self.at(pp).concepts
        sum_sim = combs.inject(0.0) { |sum,k| sum + k.first.weighted_concept_similarity(k.last) }
        sim += sum_sim/combs.count
      end

      if sim > max_sim
        max_sim = sim
        best = p
      end
    end

    best.nil? ? nil : self.at(best)
  end

  def best_concept_model_word
    max_sim = 0.0
    best = nil

    for p in 0...self.count
      sim = 0.0
      for pp in 0...self.count
        next if pp == p
        combs = self.at(p).concepts.product self.at(pp).concepts
        sum_sim = combs.inject(0.0) { |sum,k| sum + k.first.concept_words_similarity(k.last) }
        sim += sum_sim/combs.count
      end

      if sim > max_sim
        max_sim = sim
        best = p
      end
    end

    best.nil? ? nil : self.at(best)
  end
end