Commit 65040e3e69f49124aec05e30451769bb1c5f9d01

Authored by Romain Deveaud
1 parent e55598814a
Exists in master

changes made for the OAIR paper

Showing 7 changed files with 681 additions and 63 deletions

@@ -5,6 +5,8 @@
5 5 Stand-alone functions built on top of mirimiri and lda-ruby.
6 6 Aiming to extract contextual features from general corpora related to search scenarios.
7 7  
  8 +This code was used to produce the results reported in our OAIR'13 paper: "Unsupervised Latent Concept Modeling to Identify Query Facets".
  9 +
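  +A minimal usage sketch (untested, and the Indri index paths hard-coded in
  +Context::IndexPaths are machine-specific):
  +
  +    require 'context'
  +
  +    # Build latent concept models from 1..20 feedback documents, keep the
  +    # one whose concepts agree most with the others, and print it as an
  +    # Indri #weight query.
  +    model = Context.lcm 'hubble telescope'
  +    puts model.to_indriq unless model.nil?
  +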
8 10 License
9 11 =======
10 12  
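lib/context.rb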
1   -#!/usr/bin/env ruby
2   -
3   -require 'mirimiri'
4   -require 'sanitize'
5   -require 'lda-ruby'
6   -
7   -module Context
8   - IndexPaths = {
9   - :web_en => '/mnt/disk2/ClueWeb09_English_1_sDocs',
10   - :web_fr => '/mnt/disk2/ClueWeb09_French_1_sDocs',
11   - :web_nospam => '/mnt/disk1/ClueWeb09_English_1noSpam',
12   - :gigaword => '/local/data/GigaWord/index',
13   - :nyt => '/local/data/NYT_index',
14   - :wiki_en => '/local/data/WikiEn_index',
15   - :wiki_fr => '/local/data/WikiFr_index'
16   - }
17   -
18   - def Context.term_context index_path,query,size,num_page,args={}
19   - args[:func] ||= :entropy
20   - args[:window] ||= 1
21   -
22   - docs = self.feedback_docs index_path,query,num_page
23   -
24   - resource = Mirimiri::Document.new docs.join(' ')
25   - terms = self.extract_ngrams resource,args[:func].to_sym,args[:window]
26   -
27   - context = "#weight ( #{terms.compact.sort{ |a,b| b[0] <=> a[0]}[0,size].collect { |e| "#{e[0]} #1(#{e[1]})" }.join " "} ) " unless terms.empty?
28   -
29   - context
30   - end
31   -
32   - def Context.topic_context index_path,query,size,num_page,args={}
33   - corpus = Lda::Corpus.new
34   -
35   - docs = self.feedback_docs index_path,query,num_page
36   - docs.each do |d|
37   - doc = Lda::TextDocument.new corpus,d
38   - corpus.add_document doc
39   - end
40   -
41   - lda = Lda::Lda.new corpus
42   - lda.num_topics = num_page/10
43   - lda.em 'random'
44   - puts lda.top_words(size)
45   - end
46   -
47   - private
48   - def Context.feedback_docs index_path,query,num_page
49   - query = Indri::IndriQuery.new({:query => query, :count => num_page},"-printDocuments=true -trecFormat=true")
50   - index = Indri::IndriIndex.new index_path
51   - idocs = Indri::IndriPrintedDocuments.new(index.runquery(query).force_encoding("ISO-8859-1").encode("UTF-8"))
52   -
53   - docs = idocs.extract_docs.collect { |idoc| Sanitize.clean idoc,:remove_contents => ['script'] }
54   - docs
55   - end
56   -
57   - def Context.extract_ngrams resource,func,n
58   - raw_terms = 1.upto(n).collect { |i| resource.ngrams(i) }.flatten
59   - terms = raw_terms.uniq.collect { |w| [resource.send(func.to_sym,w), w.unaccent] unless w.is_stopword? || w.split.all? { |e| e.length <= 1 } || w.split.all? { |e| e !~ /[a-zA-Z]/ } || w.include?(".") }
60   - terms
61   - end
62   -
63   -end
  1 +#!/usr/bin/env ruby
  2 +
  3 +require 'mirimiri'
  4 +require 'sanitize'
  5 +require 'lda-ruby'
    +require 'nokogiri' # used below by label_candidate and feedback_docs
    +require 'peach'    # Array#pmap, used by entity_web_ranking
  6 +require 'context/conceptual_element'
  7 +require 'context/concept_model'
  8 +require 'context/concept'
  9 +require 'context/query_context'
  10 +
  11 +
  12 +module Context
  13 + @@count = Hash.new { |h,k| h[k] = {} }
  14 + @@df = Hash.new { |h,k| h[k] = {} }
  15 + @@semaphore = Mutex.new
  16 +
  17 + IndexPaths = {
  18 + :web_en => '/mnt/disk2/ClueWeb09_English_1_sDocs',
  19 + :web_fr => '/mnt/disk2/ClueWeb09_French_1_sDocs',
  20 + :web_nospam => '/mnt/disk1/ClueWeb09_English_1noSpam',
  21 + :robust => '/mnt/disk5/Robust04/',
  22 + :wt10g => '/mnt/disk3/WT10g_index',
  23 + :gov2 => '/mnt/disk3/GOV2_index',
  24 + :gigaword => '/local/data/GigaWord/index',
  25 + :nyt => '/local/data/NYT_index',
  26 + :wiki_en => '/local/data/WikiEn_index',
  27 + :wiki_en2012 => '/local/data/WikiEn2012_index',
  28 + :wiki_fr => '/local/data/WikiFr_index',
  29 + :wiki_tc2012 => '/local/data/INEXQA2012index',
  30 + :books => '/local/data/INEX/Books2011/indexedit',
  31 + :ent => '/home/sanjuan/works/nist_eval/csiro_indri.ind'
  32 + }
  33 +
  34 + IndexPathsCaracole = {
  35 + :web_en => '/distant/index_clueweb/disk2/ClueWeb09_English_1_sDocs',
  36 + :web_nospam => '/distant/index_clueweb/disk1/ClueWeb09_English_1noSpam',
  37 + :robust => '/distant/data/Robust04',
  38 + :wt10g => '/distant/index_clueweb/disk3/WT10g_index',
  39 + :gov2 => '/distant/index_clueweb/disk3/GOV2_index'
  40 + }
  41 +
  42 + # #
  43 + # From the SIGKDD 2007 paper: "Exploiting underrepresented query aspects for automatic query expansion"
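    + # A reading of the code (not of the paper): every sub-phrase s of the
    + # query is scored by existence(s) * support(s), where
    + #   existence(s) = count(#1(s)) / count(s in a large unordered window)
    + #   support(s)   = count(#1(s)) / sum of count(#1(p)) over permutations p != s
    + # so exact phrases that rarely occur reordered get high scores.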
  44 + def Context.query_aspects q
  45 + query = Mirimiri::Document.new q
  46 +
  47 + 2.upto(query.words.count) do |size|
  48 + query.ngrams(size).each do |s|
  49 + dp = Context.count_w Context::IndexPaths[:wiki_en2012],"#1(#{s})"
  50 + d = Context.count_w Context::IndexPaths[:wiki_en2012],"#{s}",100000000
  51 + p s
  52 +
  53 + denum = s.split.permutation.inject(0.0) do |res,p|
  54 + tmp = (p == s.split) ? 0 : Context.count_w(Context::IndexPaths[:wiki_en2012],"#1(#{p.join(" ")})")
  55 + res + tmp
  56 + end
  57 +
  58 + existence = dp.to_f/d
  59 + support = dp.to_f/denum
  60 + puts "#{s} ===> #{existence*support}"
  61 + end
  62 + end
  63 + end
  64 +
  65 + # #
  66 + # From the CIKM 2007 paper: "Ranking Very Many Typed Entities on Wikipedia"
  67 + #
  68 + # The ``entities`` parameter is currently an array of strings. Could be moved
  69 + # to an array of Entity objects.
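    + #
    + # Scoring sketch, as implemented below: each entity is run as its own
    + # query and is scored by the average precision of its top-``nbdocs``
    + # ranking, where the "relevant" set is the overlap between the entity's
    + # results and the original query's results.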
  70 + def Context.entity_web_ranking query,entities,nbdocs=100,index='web_nospam'
  71 + q = Indri::IndriQuery.new({:query => "#combine ( #{query} )", :count => nbdocs},"-trecFormat=true")
  72 + indri_index = Indri::IndriIndex.new IndexPaths[index.to_sym]
  73 + docs = indri_index.runquery(q).force_encoding("ISO-8859-1").encode("UTF-8")
  74 + query_list = docs.split("\n").collect { |p| p.scan(/\d+ Q0 (.+) \d+ -\d+\.\d+ .+/).first }.flatten
  75 +
  76 + res = entities.pmap(15) do |e|
  77 + eq = Indri::IndriQuery.new({:query => "#combine ( #{e.gsub(/[^a-zA-Z0-9\s]/,'')} )", :count => nbdocs},"-trecFormat=true")
  78 + edocs = indri_index.runquery(eq).force_encoding("ISO-8859-1").encode("UTF-8")
  79 + e_list = edocs.split("\n").collect { |p| p.scan(/\d+ Q0 (.+) \d+ -\d+\.\d+ .+/).first }.flatten
  80 +
  81 + rels = e_list&query_list
  82 +
  83 + ave_p = 1.upto(nbdocs).inject(0.0) do |sum,k|
  84 + p = (e_list.first(k)&rels).count.to_f/k
  85 + rel = rels.include?(e_list[k-1]) ? 1.0 : 0.0
  86 + sum + p*rel
  87 + end
  88 +
  89 + {:name => e, :score => ave_p}
  90 + end
  91 +
  92 + res.sort { |a,b| b[:score] <=> a[:score] }
  93 + end
  94 +
  95 + def Context.query_entities query,nb_docs=10
  96 + sources = ['wiki_en2012']
  97 +# sources = ['wiki_en2012','web_nospam','nyt','gigaword']
  98 +# sources = ['web_fr']
  99 + sc = Hash.new { |h,k| h[k] = 0.0 }
  100 +
  101 + sources.each do |source|
  102 + puts " == source : #{source}"
  103 + c = ConceptModel.new query,source,nb_docs
  104 + p c.query
  105 +
  106 + c.concepts.each do |concept|
  107 + querys = concept.words[0,4].join " "
  108 +
  109 + d1 = Context::label_candidate querys.sequential_dependence_model,'wiki_en'
  110 + d2 = Context::label_candidate querys.sequential_dependence_model,'wiki_en2012'
  111 + d3 = Mirimiri::WikipediaPage.search_wikipedia_titles querys
  112 +
  113 + d1 = [] if d1.nil?
  114 + d2 = [] if d2.nil?
  115 + d3 = [] if d3.nil?
  116 +
  117 + d = d2 & d3
  118 + labels = d.collect { |c| c.downcase.gsub(/[^\w\d]/,' ') }
  119 + p d
  120 +
  121 +
  122 + mins = -10000000
  123 + lab = nil
  124 + scores = labels.collect do |l|
  125 + s = concept.score_label l
  126 + if s > mins
  127 + mins = s
  128 + lab = l
  129 + end
  130 + sc[l] += s*(concept.coherence/c.total_coherence)
  131 + { :label => l, :score => s }
  132 + end
  133 +
  134 + print "#{concept.coherence/c.total_coherence} <= "
  135 + p concept.elements.collect { |c| c.word }
  136 + end
  137 + end
  138 +
  139 + sc.sort { |a,b| b[1] <=> a[1] }
  140 + end
  141 +
  142 + def Context.label_candidate query,index,nb_candidates=10,rm3=false # nb_candidates needs a default: query_entities calls this with only two arguments
  143 +# Mirimiri::WikipediaPage.search_wikipedia_titles query
  144 + args = rm3 ? "-fbDocs=20 -fbTerms=30 -fbMu=2000 -fbOrigWeight=0.7" : ""
  145 + q = Indri::IndriQuery.new({:query => query, :count => nb_candidates},"-printDocuments=true -trecFormat=true #{args}")
  146 + indri_index = Indri::IndriIndex.new IndexPaths[index.to_sym]
  147 + docs = indri_index.runquery q
  148 + docs = docs.force_encoding("ISO-8859-1").encode("UTF-8") if ['web_fr','web_en','web_nospam'].include? index
  149 + idocs = Indri::IndriPrintedDocuments.new(docs)
  150 +
  151 + wiki_titles = idocs.extract_docs.collect do |d|
  152 + t = Nokogiri::HTML d
  153 + t.xpath('//title').text
  154 + end
  155 +
  156 + wiki_titles
  157 + end
  158 +
  159 + def Context.lcm query
  160 + source = 'nyt'
  161 +
  162 + a = Time.now
  163 + qc = QueryContext.new(1.upto(20).collect do |nb_docs|
  164 + beg = Time.now
  165 + c = ConceptModel.new query,source,nb_docs
  166 + puts "#{nb_docs} ==> Time elapsed: #{Time.now-beg} seconds"
  167 + c
  168 + end)
  169 + puts "All concepts : #{Time.now-a} seconds"
  170 +
  171 + model = qc.best_concept_model
  172 + puts "Total : #{Time.now-a} seconds"
  173 + model
  174 + end
  175 +
  176 + def Context.term_context index_path,query,size,num_page,args={}
  177 + terms = self.term_concepts index_path,query,size,num_page,args
  178 + args[:window] ||= 1
  179 +
  180 +# context = "#weight ( #{terms.compact.sort{ |a,b| b[0] <=> a[0]}[0,size].collect { |e| "#{e[0]} #1(#{e[1]})" }.join " "} ) " unless terms.empty?
  181 + context = "#weight ( #{terms.collect { |c| "#{"%.10f" % c[:score]} #uw#{args[:window]}(#{c[:concept]})" }.join " "} ) " unless terms.empty?
  182 +
  183 + context
  184 + end
  185 +
  186 +# From the SIGIR'06 paper: `Improving the estimation of relevance models using large external corpora`
  187 +#
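    +# As implemented here: each unigram t of a feedback document D is weighted
    +# tf(t,D) * exp(score(D)), with score(D) the (log-domain) Indri retrieval
    +# score, and the top `size` terms form an Indri #weight query.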
  188 + def Context.morm index_path,query,size,num_page
  189 + docs,scores,names = self.feedback_docs index_path,query,num_page
  190 +
  191 + terms = []
  192 +
  193 + docs.each do |d|
  194 + r = Mirimiri::Document.new d
  195 + tmp = self.extract_ngrams r,:tf,1
  196 + terms += tmp.compact.collect { |t| [t[0]*Math.exp(scores[docs.index(d)].to_f),t[1]] }
  197 + end
  198 +
  199 + final = terms.compact.sort{ |a,b| b[0] <=> a[0]}[0,size].collect { |e| { :score => e[0], :concept => e[1] } }
  200 + context = "#weight ( #{final.collect { |c| "#{"%.10f" % c[:score]} #1(#{c[:concept]})" }.join " "} ) " unless terms.empty?
  201 +
  202 + context
  203 + end
  204 +
  205 + def Context.term_concepts index_path,query,size,num_page,args={}
  206 + args[:func] ||= :entropy
  207 + args[:window] ||= 1
  208 +
  209 + docs = self.feedback_docs index_path,query,num_page
  210 +
  211 + resource = Mirimiri::Document.new docs.join(' ')
  212 + terms = self.extract_ngrams resource,args[:func].to_sym,args[:window]
  213 +
  214 + terms.compact.sort{ |a,b| b[0] <=> a[0]}[0,size].collect { |e| { :score => e[0], :concept => e[1] } }
  215 + end
  216 +
  217 +
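    + # Overlap similarity between two word sequences: the fraction of s1's
    + # words shared with s2, times the summed log inverse collection frequency
    + # (Context.df) of the shared words, so rare shared words count more.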
  218 + def Context.sentence_similarity s1,s2,index_path
  219 + q = s1.is_a?(String) ? s1.split : s1
  220 + r = s2.is_a?(String) ? s2.split : s2
  221 +
  222 + inter = q & r
  223 +
  224 + s = (inter.count/q.count.to_f) * inter.inject(0.0) { |sum,w| sum + Math.log(Context.df(index_path,w)) }
  225 + s
  226 + end
  227 +
  228 +
  229 + private
  230 +
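    + # The helpers below shell out to Indri's `dumpindex` binary for collection
    + # statistics and memoise the results in the @@count/@@df class variables;
    + # @@semaphore guards the writes (these can run from pmap worker threads).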
  231 + def Context.df index_path,w,window=1
  232 + if @@count[index_path]["total#{window}"].nil?
  233 + total = `dumpindex #{index_path} s`.match(/total terms:\t(.*)/)[1].to_f-(window-1).to_f
  234 + @@semaphore.synchronize {
  235 + @@count[index_path]["total#{window}"] = total
  236 + }
  237 + end
  238 +
  239 + if @@df[index_path]["#uw#{window}(#{w})"].nil?
  240 + nb = `dumpindex #{index_path} e "#uw#{window}(#{w})" | awk ' { arr[$1]=$0 } END { for ( key in arr ) { print arr[key] } } ' | wc -l`.chomp.split(':').last.to_f - 1
  241 + @@semaphore.synchronize {
  242 + @@df[index_path]["#uw#{window}(#{w})"] = nb+1.0
  243 + }
  244 + end
  245 + begin
  246 + d = @@count[index_path]["total#{window}"]/@@df[index_path]["#uw#{window}(#{w})"]
  247 + rescue
  248 + puts w
  249 + exit
  250 + end
  251 + d
  252 + end
  253 +
  254 + def Context.prob_w index_path,w,window=1
  255 + if @@count[index_path]["total#{window}"].nil?
  256 + total = `dumpindex #{index_path} s`.match(/total terms:\t(.*)/)[1].to_f-(window-1).to_f
  257 + @@semaphore.synchronize {
  258 + @@count[index_path]["total#{window}"] = total+1.0
  259 + }
  260 + end
  261 +
  262 + nb = self.count_w index_path,w,window
  263 + nb/@@count[index_path]["total#{window}"]
  264 + end
  265 +
  266 + def Context.count_w index_path,w,window=1
  267 + if @@count[index_path]["##{window}(#{w})"].nil?
  268 + nb = `dumpindex #{index_path} x "##{window}(#{w})"`.chomp.split(':').last.to_f
  269 + @@semaphore.synchronize {
  270 + @@count[index_path]["##{window}(#{w})"] = ("%.15f" % nb).to_f+1.0
  271 + }
  272 + end
  273 + @@count[index_path]["##{window}(#{w})"]
  274 + end
  275 +
  276 +
  277 + public
  278 + def Context.extract_ngrams resource,func,n
  279 + raw_terms = 1.upto(n).collect { |i| resource.ngrams(i) }.flatten
  280 +# raw_terms = resource.ngrams(n).flatten
  281 + terms = raw_terms.uniq.collect { |w| [resource.send(func.to_sym,w), w.unaccent] unless w.is_stopword? || w.split.any? { |e| e.length <= 2 } || w.split.any? { |e| e !~ /[a-zA-Z]/ } || w.include?(".") || (Mirimiri::Stoplist&w.unaccent.split).count >= 2 }
  282 +# terms = raw_terms.uniq.collect { |w| w=w.gsub(/\W/,' ').strip; [resource.send(func.to_sym,w), w.unaccent] unless w.is_stopword? || w.split.any? { |e| e.length <= 2 } || w.split.any? { |e| e !~ /[a-zA-Z]/ } || w.include?(".") || (Mirimiri::Stoplist&w.unaccent.split).count >= 1 }
  283 +# terms = raw_terms.uniq.collect { |w| [resource.send(func.to_sym,w), w.unaccent] unless w.is_stopword? || w.split.all? { |e| e.length <= 1 } || w.split.all? { |e| e !~ /[a-zA-Z]/ } || w.include?(".") }
  284 + terms
  285 + end
  286 +
  287 + def Context.feedback_docs index_path,query,num_page
  288 + query = Indri::IndriQuery.new({:query => query, :count => num_page},"-printDocuments=true -trecFormat=true")
  289 + index = Indri::IndriIndex.new index_path
  290 + idocs = Indri::IndriPrintedDocuments.new(index.runquery(query).force_encoding("ISO-8859-1").encode("UTF-8"))
  291 +
  292 + texts,scores,names = idocs.extract_docs_score
  293 +
  294 + docs = texts.collect do |idoc|
  295 + begin
  296 + Sanitize.clean idoc,:remove_contents => ['script','style']
  297 + rescue
  298 + d = Nokogiri::HTML(idoc)
  299 + d.xpath('//text()').text
  300 + end
  301 + end
  302 +
  303 + return docs,scores,names
  304 + end
  305 +
  306 +end
lib/context/concept.rb
  1 +#!/usr/bin/env ruby
  2 +
  3 +class Concept
  4 + attr_reader :elements, :coherence
  5 +
  6 + def initialize
  7 + @elements = []
  8 + @coherence = 0
  9 + end
  10 +
  11 + def <<(elem)
  12 + raise ArgumentError, 'Argument must be a ConceptualElement.' unless elem.is_a? ConceptualElement
  13 + @elements << elem
  14 + end
  15 +
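    + # Coherence of this concept (topic k), as used by ConceptModel:
    + # sum over feedback documents i of exp(theta[i][k]) * exp(score_i), i.e.
    + # the topic's weight in each document scaled by the document's
    + # exponentiated retrieval score.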
  16 + def compute_coherence scores,theta,k # arg=nil
  17 +# update_feedback_coherence arg
  18 + @coherence = 0.upto(theta.count-1).inject(0.0) do |sum,i|
  19 + sum + Math.exp(theta[i][k])*Math.exp(scores[i].to_f)
  20 + end
  21 + end
  22 +
  23 + def score_label label,index_path=Context::IndexPaths[:wiki_en2012]
  24 + s = @elements.inject(0.0) do |res,e|
  25 +# *self.prob_w(index_path,"#{w[:word]} #uw10(#{label})")
  26 + res + e.prob*Math.log(Context.prob_w(index_path,"#{e.word} #uw10(#{label})")/(e.p_in_coll*Context.prob_w(index_path,label)))
  27 + end
  28 +
  29 + s
  30 + end
  31 +
  32 + def concept_words_similarity s,index_path=Context::IndexPaths[:wiki_en]
  33 +# inter = @elements.collect { |w| w unless (w.word & s).empty? }
  34 + inter = self.words & s.words
  35 +
  36 + sim = (inter.count/self.words.count.to_f) * inter.inject(0.0) { |sum,w| sum + Math.log(Context.df(index_path,w)) }
  37 + sim
  38 + end
  39 +
  40 + def weighted_concept_similarity s,index_path=Context::IndexPaths[:wiki_en]
  41 + inter = self.words & s.words
  42 + sim = (inter.count/self.words.count.to_f)
  43 + sim *= @elements.inject(0.0) do |sum,w|
  44 + wp = s.get_element_from_word w.word
  45 +
  46 + s.words.include?(w.word) ? sum + wp.prob*w.prob*Math.log(Context.df(index_path,w.word)) : sum + 0.0
  47 + end
  48 + sim
  49 + end
  50 +
  51 + def get_element_from_word w
  52 + @elements.select { |e| e.word == w }.first
  53 + end
  54 +
  55 + def words
  56 + @elements.collect { |w| w.word }
  57 + end
  58 +
  59 + def word_probs
  60 + res = {}
  61 + @elements.each { |w| res[w.word] = w.prob }
  62 + res
  63 + end
  64 +
  65 +# From papers:
  66 +# NAACL'10: `Automatic Evaluation of Topic Coherence`
  67 +# EMNLP'12: `Exploring Topic Coherence over many models and many topics`
  68 +#
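    +# uci_coherence below is a smoothed UCI-style measure: the average over
    +# word pairs (w_i,w_j) of log((P(w_i,w_j)+epsilon)/(P(w_i)P(w_j))), with
    +# pair probabilities estimated in 20-term unordered windows.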
  69 + def uci_coherence epsilon=1,index_path=Context::IndexPaths[:wiki_en]
  70 + coherence = @elements.combination(2).inject(0.0) do |res,bigram|
  71 +#Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}",20)*
  72 + t = (Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}",20)+epsilon)/((bigram.first.p_in_coll)*(bigram.last.p_in_coll))
  73 + res + Math.log(t)
  74 + end
  75 +
  76 + coherence /= @elements.count*(@elements.count-1)
  77 + coherence
  78 + end
  79 +
  80 + def uci_coherence_entropy index_path=Context::IndexPaths[:wiki_en]
  81 + coherence = @elements.combination(2).inject(0.0) do |res,bigram|
  82 +#Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}",20)*
  83 + t = (Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}",20))/((bigram.first.p_in_coll)*(bigram.last.p_in_coll))
  84 + res + t
  85 + end
  86 +
  87 + coherence /= @elements.count*(@elements.count-1)
  88 + coherence
  89 + end
  90 +
  91 + protected
  92 + def update_coherence index_path=Context::IndexPaths[:wiki_en]
  93 + coherence = @elements.combination(2).inject(0.0) do |res,bigram|
  94 + res + Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}")*Math.log(Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}")/(bigram.first.p_in_coll(index_path)*bigram.last.p_in_coll(index_path)))
  95 + end
  96 +
  97 + coherence /= @elements.count*(@elements.count-1)
  98 + @coherence = coherence
  99 + end
  100 +
  101 + def update_feedback_coherence documents
  102 + corpus = Mirimiri::Document.new documents.join " "
  103 +
  104 + windows = corpus.ngrams(10).collect { |w| w.split }
  105 +
  106 + coherence = @elements.combination(2).inject(0.0) do |res,bigram|
  107 + big_prob = windows.count{ |c| c.include?(bigram.first.word) && c.include?(bigram.last.word) }.to_f/windows.count
  108 + mi = big_prob.zero? ? 0.0 : big_prob*bigram.first.prob*bigram.last.prob*Math.log(big_prob/(corpus.tf(bigram.first.word)*corpus.tf(bigram.last.word)))
  109 + res + mi
  110 + end
  111 +
  112 + coherence /= @elements.count*(@elements.count-1)
  113 + @coherence = coherence
  114 + end
  115 +
  116 +end
lib/context/concept_model.rb
  1 +#!/usr/bin/env ruby
  2 +
  3 +require 'lda-ruby'
  4 +require 'peach'
  5 +
  6 +class ConceptModel
  7 + attr_reader :concepts,:documents,:source,:nbdocs,:nbterms,:query,:total_coherence,:doc_scores,:doc_names,:theta,:entropy_coherence,:avg_coherence
  8 +
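    + # Parses topics printed by an external HDP implementation; judging from
    + # the parsing below, `str` must eval to an array of strings of the form
    + # "topic 0: 0.032*word + 0.018*other + ...".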
  9 + def ConceptModel.parse_hdp str
  10 + concepts = []
  11 + eval(str).each do |hdp_top|
  12 + c = Concept.new
  13 + hdp_top.gsub(/topic \d: /,'').split(" + ").each do |words|
  14 + ee = words.split('*')
  15 + begin
  16 + e = ConceptualElement.new ee[1],ee[0].to_f
  17 + c << e
  18 + rescue ArgumentError
  19 + next
  20 + end
  21 + end
  22 +
  23 + concepts << c
  24 + end
  25 + concepts
  26 + end
  27 +
  28 + def initialize query,source,nb_docs,nb_terms=10,k=false
  29 + raise ArgumentError, 'Argument 1 must be a String.' unless query.is_a? String
  30 + raise ArgumentError, 'Argument 2 must be a valid Index key.' unless Context::IndexPaths.has_key?(source.to_sym)
  31 +
  32 + @source = source.to_sym
  33 + @nbdocs = nb_docs
  34 + @nbterms = nb_terms
  35 + @query = query
  36 + @concepts = []
  37 + @total_coherence = 0.0
  38 +
  39 + corpus = Lda::Corpus.new
  40 +
  41 + @documents,@doc_scores,@doc_names = Context.feedback_docs Context::IndexPaths[@source],@query,@nbdocs
  42 + @documents.each do |d|
  43 + doc = Lda::TextDocument.new corpus,d
  44 + corpus.add_document doc
  45 + end
  46 +
  47 + if k == false
  48 + num_topics = topic_divergence corpus
  49 + else
  50 + num_topics = k
  51 + end
  52 +
  53 + lda = Lda::Lda.new corpus
  54 + lda.verbose=false
  55 + lda.num_topics = num_topics
  56 +
  57 + lda.em('random')
  58 +
  59 + @beta = lda.beta # to avoid repeated expensive computation
  60 + @vocab = lda.vocab #
  61 +
  62 + @theta = lda.compute_topic_document_probability
  63 +
  64 +# Normalizing the phi_t(w) weights for each topic
  65 +#
  66 + total_prob = {}
  67 + tmp_top_word_indices(@nbterms,@vocab,@beta).each_pair do |topic,indices|
  68 + total_prob[topic] = indices.inject(0.0) { |res,i| res + Math.exp(@beta[topic][i].to_f) }
  69 + end
  70 +
  71 + tmp_top_word_indices(@nbterms,@vocab,@beta).each_pair do |topic,indices|
  72 + c = Concept.new
  73 + indices.each do |i|
  74 + begin
  75 + e = ConceptualElement.new @vocab[i],(Math.exp(@beta[topic][i].to_f)/total_prob[topic])
  76 + c << e
  77 + rescue ArgumentError
  78 + next
  79 + end
  80 + end
  81 +
  82 + c.compute_coherence @doc_scores,@theta,topic
  83 +
  84 +# c.compute_coherence @doc_scores,gamma_m,topic # takes time since it has to compute several probabilities
  85 + @concepts << c
  86 + @total_coherence += c.coherence
  87 + end
  88 + end
  89 +
  90 + def to_s
  91 + @concepts.collect do |c|
  92 + "#{c.coherence/@total_coherence} => [#{c.elements.collect do |e|
  93 + "#{e.prob} #{e.word}"
  94 + end.join(', ')
  95 + }]"
  96 + end.join "\n"
  97 + end
  98 +
  99 + def to_indriq
  100 + "#weight( #{@concepts.collect do |c|
  101 + "#{c.coherence/@total_coherence} #weight ( #{c.elements.collect do |e|
  102 + "#{e.prob} #{e.word}"
  103 + end.join(' ')
  104 + } ) "
  105 + end.join " "} )"
  106 + end
  107 +
  108 + def <<(concept)
  109 + raise ArgumentError, 'Argument must be a Concept.' unless concept.is_a? Concept
  110 + @concepts << concept
  111 + end
  112 +
  113 + def avg_model_coherence
  114 + if @documents.empty?
  115 + @avg_coherence = 0.0
  116 + else
  117 + @avg_coherence = @concepts.inject(0.0) { |res,c| res + c.uci_coherence }/@concepts.count #if @avg_coherence.nil?
  118 + end
  119 + @avg_coherence
  120 + end
  121 +
  122 + def entropy_model_coherence
  123 + if @documents.empty?
  124 + @entropy_coherence = 0.0
  125 + else
  126 + @entropy_coherence = @concepts.inject(0.0) do |res,c|
  127 + ent = c.uci_coherence_entropy
  128 + ent += 0.0000000000000000000000001 if ent.zero?
  129 + res + ent*Math.log(ent)
  130 + end #if @entropy_coherence.nil?
  131 + end
  132 + @entropy_coherence
  133 + end
  134 +
  135 + private
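    + # Picks the number of topics: fits LDA for k up to 20 and keeps the k
    + # that maximises the average symmetric KL divergence between the word
    + # distributions (beta) of all topic pairs.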
  136 + def topic_divergence corpus
  137 + max_kl = 0.0
  138 +# Old trick to limit number of iterations
  139 +# num_p = @nbdocs < 6 ? @nbdocs + 5 : @nbdocs
  140 +
  141 + semaphore = Mutex.new
  142 +
  143 + 1.upto(20).inject do |k,ntop|
  144 +# 1.upto(num_p).inject do |k,ntop|
  145 + lda = Lda::Lda.new corpus
  146 + lda.verbose=false
  147 + lda.num_topics = ntop
  148 + lda.em('random')
  149 + beta_m = lda.beta # to avoid repeated expensive computation
  150 + vocab = lda.vocab
  151 +
  152 + topics_i = Array.new(ntop) { |i| i }
  153 +
  154 + sum_kl = topics_i.combination(2).inject(0.0) do |kl,topics|
  155 + ti = topics.first
  156 + tj = topics.last
  157 + begin
  158 + kl + 0.upto(vocab.count-1).inject(0.0) do |res,w_i|
  159 + res + ( Math.exp(beta_m[ti][w_i])*Math.log(Math.exp(beta_m[ti][w_i])/Math.exp(beta_m[tj][w_i])) + Math.exp(beta_m[tj][w_i])*Math.log(Math.exp(beta_m[tj][w_i])/Math.exp(beta_m[ti][w_i])) )
  160 + end
  161 + rescue
  162 + kl + 0.0
  163 + end
  164 + end
  165 +
  166 + sum_kl /= ntop*(ntop-1)
  167 + sum_kl = max_kl if sum_kl.nan? || sum_kl.infinite?
  168 +
  169 + sum_kl <= max_kl ? k : (max_kl = sum_kl and ntop)
  170 + end
  171 + end
  172 +
  173 + def tmp_top_word_indices(words_per_topic = 10,vocab,beta)
  174 + raise 'No vocabulary loaded.' unless vocab
  175 +
  176 + # find the highest scoring words per topic
  177 + topics = Hash.new
  178 + indices = (0...vocab.size).to_a
  179 +
  180 + beta.each_with_index do |topic, topic_num|
  181 + topics[topic_num] = (topic.zip((0...vocab.size).to_a).sort { |i, j| i[0] <=> j[0] }.map { |i, j| j }.reverse)[0...words_per_topic]
  182 + end
  183 +
  184 + topics
  185 + end
  186 +
  187 +end
lib/context/conceptual_element.rb
  1 +#!/usr/bin/env ruby
  2 +
  3 +class ConceptualElement
  4 + attr_reader :word, :prob
  5 +
  6 + def initialize w,s
  7 + raise ArgumentError, 'Argument 1 must be a String.' unless w.is_a? String
  8 + raise ArgumentError, 'Argument 2 must be a Float.' unless s.is_a? Float
  9 +
  10 + tmp = w.gsub(/(-)\1+/,'-').gsub(/([^\w-].*|^-|-$)/,'')
  11 + raise ArgumentError, 'Argument 1 is not a useful word! ;)' if tmp.is_stopword? || tmp.size < 2
  12 +
  13 + @word = tmp
  14 + @prob = s
  15 + end
  16 +
  17 + def p_in_coll index_path=Context::IndexPaths[:wiki_en],size=20
  18 + Context.prob_w index_path,@word,size
  19 + end
  20 +end
lib/context/query_context.rb
  1 +#!/usr/bin/env ruby
  2 +
  3 +class QueryContext < Array
  4 +
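    + # A QueryContext is an Array of ConceptModel (one per feedback-document
    + # count). Both methods below return the model whose concepts are on
    + # average most similar to those of every other model; the first uses the
    + # probability-weighted similarity, the second plain word overlap.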
  5 + def best_concept_model
  6 + max_sim = 0.0
  7 + best = nil
  8 +
  9 + for p in 0...self.count
  10 + sim = 0.0
  11 + for pp in 0...self.count
  12 + next if pp == p
  13 + combs = self.at(p).concepts.product self.at(pp).concepts
  14 + sum_sim = combs.inject(0.0) { |sum,k| sum + k.first.weighted_concept_similarity(k.last) }
  15 + sim += sum_sim/combs.count
  16 + end
  17 +
  18 +
  19 + if sim > max_sim
  20 + max_sim = sim
  21 + best = p
  22 + end
  23 + end
  24 +
  25 + best.nil? ? nil : self.at(best)
  26 + end
  27 +
  28 + def best_concept_model_word
  29 + max_sim = 0.0
  30 + best = nil
  31 +
  32 + for p in 0...self.count
  33 + sim = 0.0
  34 + for pp in 0...self.count
  35 + next if pp == p
  36 + combs = self.at(p).concepts.product self.at(pp).concepts
  37 + sum_sim = combs.inject(0.0) { |sum,k| sum + k.first.concept_words_similarity(k.last) }
  38 + sim += sum_sim/combs.count
  39 + end
  40 +
  41 +
  42 + if sim > max_sim
  43 + max_sim = sim
  44 + best = p
  45 + end
  46 + end
  47 +
  48 + best.nil? ? nil : self.at(best)
  49 + end
  50 +end