Commit 65040e3e69f49124aec05e30451769bb1c5f9d01
1 parent e55598814a
Exists in master
Changes made for the OAIR paper
Showing 7 changed files with 681 additions and 63 deletions
README.markdown
... | ... | @@ -5,6 +5,8 @@ |
5 | 5 | Stand-alone functions built on top of mirimiri and lda-ruby. |
6 | 6 | Aiming to extract contextual features from general corpora related to search scenario. |
7 | 7 | |
8 | +This code was used to produce the results reported in our OAIR'13 paper: "Unsupervised Latent Concept Modeling to Identify Query Facets". | 
9 | + | |
8 | 10 | License |
9 | 11 | ======= |
10 | 12 |
context.rb
1 | -#!/usr/bin/env ruby | |
2 | - | |
3 | -require 'mirimiri' | |
4 | -require 'sanitize' | |
5 | -require 'lda-ruby' | |
6 | - | |
7 | -module Context | |
8 | - IndexPaths = { | |
9 | - :web_en => '/mnt/disk2/ClueWeb09_English_1_sDocs', | |
10 | - :web_fr => '/mnt/disk2/ClueWeb09_French_1_sDocs', | |
11 | - :web_nospam => '/mnt/disk1/ClueWeb09_English_1noSpam', | |
12 | - :gigaword => '/local/data/GigaWord/index', | |
13 | - :nyt => '/local/data/NYT_index', | |
14 | - :wiki_en => '/local/data/WikiEn_index', | |
15 | - :wiki_fr => '/local/data/WikiFr_index' | |
16 | - } | |
17 | - | |
18 | - def Context.term_context index_path,query,size,num_page,args={} | |
19 | - args[:func] ||= :entropy | |
20 | - args[:window] ||= 1 | |
21 | - | |
22 | - docs = self.feedback_docs index_path,query,num_page | |
23 | - | |
24 | - resource = Mirimiri::Document.new docs.join(' ') | |
25 | - terms = self.extract_ngrams resource,args[:func].to_sym,args[:window] | |
26 | - | |
27 | - context = "#weight ( #{terms.compact.sort{ |a,b| b[0] <=> a[0]}[0,size].collect { |e| "#{e[0]} #1(#{e[1]})" }.join " "} ) " unless terms.empty? | |
28 | - | |
29 | - context | |
30 | - end | |
31 | - | |
32 | - def Context.topic_context index_path,query,size,num_page,args={} | |
33 | - corpus = Lda::Corpus.new | |
34 | - | |
35 | - docs = self.feedback_docs index_path,query,num_page | |
36 | - docs.each do |d| | |
37 | - doc = Lda::TextDocument.new corpus,d | |
38 | - corpus.add_document doc | |
39 | - end | |
40 | - | |
41 | - lda = Lda::Lda.new corpus | |
42 | - lda.num_topics = num_page/10 | |
43 | - lda.em 'random' | |
44 | - puts lda.top_words(size) | |
45 | - end | |
46 | - | |
47 | - private | |
48 | - def Context.feedback_docs index_path,query,num_page | |
49 | - query = Indri::IndriQuery.new({:query => query, :count => num_page},"-printDocuments=true -trecFormat=true") | |
50 | - index = Indri::IndriIndex.new index_path | |
51 | - idocs = Indri::IndriPrintedDocuments.new(index.runquery(query).force_encoding("ISO-8859-1").encode("UTF-8")) | |
52 | - | |
53 | - docs = idocs.extract_docs.collect { |idoc| Sanitize.clean idoc,:remove_contents => ['script'] } | |
54 | - docs | |
55 | - end | |
56 | - | |
57 | - def Context.extract_ngrams resource,func,n | |
58 | - raw_terms = 1.upto(n).collect { |i| resource.ngrams(i) }.flatten | |
59 | - terms = raw_terms.uniq.collect { |w| [resource.send(func.to_sym,w), w.unaccent] unless w.is_stopword? || w.split.all? { |e| e.length <= 1 } || w.split.all? { |e| e !~ /[a-zA-Z]/ } || w.include?(".") } | |
60 | - terms | |
61 | - end | |
62 | - | |
63 | -end |
lib/context.rb
1 | +#!/usr/bin/env ruby | |
2 | + | |
3 | +require 'mirimiri' | |
4 | +require 'sanitize' | |
5 | +require 'lda-ruby' | |
6 | +require 'context/conceptual_element' | |
7 | +require 'context/concept_model' | |
8 | +require 'context/concept' | |
9 | +require 'context/query_context' | |
10 | + | |
11 | + | |
12 | +module Context | |
13 | + @@count = Hash.new { |h,k| h[k] = {} } | |
14 | + @@df = Hash.new { |h,k| h[k] = {} } | |
15 | + @@semaphore = Mutex.new | |
16 | + | |
17 | + IndexPaths = { | |
18 | + :web_en => '/mnt/disk2/ClueWeb09_English_1_sDocs', | |
19 | + :web_fr => '/mnt/disk2/ClueWeb09_French_1_sDocs', | |
20 | + :web_nospam => '/mnt/disk1/ClueWeb09_English_1noSpam', | |
21 | + :robust => '/mnt/disk5/Robust04/', | |
22 | + :wt10g => '/mnt/disk3/WT10g_index', | |
23 | + :gov2 => '/mnt/disk3/GOV2_index', | |
24 | + :gigaword => '/local/data/GigaWord/index', | |
25 | + :nyt => '/local/data/NYT_index', | |
26 | + :wiki_en => '/local/data/WikiEn_index', | |
27 | + :wiki_en2012 => '/local/data/WikiEn2012_index', | |
28 | + :wiki_fr => '/local/data/WikiFr_index', | |
29 | + :wiki_tc2012 => '/local/data/INEXQA2012index', | |
30 | + :books => '/local/data/INEX/Books2011/indexedit', | |
31 | + :ent => '/home/sanjuan/works/nist_eval/csiro_indri.ind' | |
32 | + } | |
33 | + | |
34 | + IndexPathsCaracole = { | |
35 | + :web_en => '/distant/index_clueweb/disk2/ClueWeb09_English_1_sDocs', | |
36 | + :web_nospam => '/distant/index_clueweb/disk1/ClueWeb09_English_1noSpam', | |
37 | + :robust => '/distant/data/Robust04', | |
38 | + :wt10g => '/distant/index_clueweb/disk3/WT10g_index', | |
39 | + :gov2 => '/distant/index_clueweb/disk3/GOV2_index' | |
40 | + } | |
41 | + | |
42 | + # # | |
43 | + # From the SIGKDD 2007 paper : "Exploiting underrepresented query aspects for automatic query expansion" | |
44 | + def Context.query_aspects q | |
45 | + query = Mirimiri::Document.new q | |
46 | + | |
47 | + 2.upto(query.words.count) do |size| | |
48 | + query.ngrams(size).each do |s| | |
49 | + dp = Context.count_w Context::IndexPaths[:wiki_en2012],"#1(#{s})" | |
50 | + d = Context.count_w Context::IndexPaths[:wiki_en2012],"#{s}",100000000 | |
51 | + p s | |
52 | + | |
53 | + denom = s.split.permutation.inject(0.0) do |res,p| | 
54 | + tmp = (p == s.split) ? 0 : Context.count_w(Context::IndexPaths[:wiki_en2012],"#1(#{p.join(" ")})") | |
55 | + res + tmp | |
56 | + end | |
57 | + | |
58 | + existence = dp.to_f/d | |
59 | + support = dp.to_f/denom | 
60 | + puts "#{s} ===> #{existence*support}" | |
61 | + end | |
62 | + end | |
63 | + end | |
64 | + | |
65 | + # # | |
66 | + # From the CIKM 2007 paper : "Ranking Very Many Typed Entities on Wikipedia" | |
67 | + # | |
68 | + # The ``entities`` parameter is currently an array of strings. Could be moved | |
69 | + # to an array of Entity objects. | |
70 | + def Context.entity_web_ranking query,entities,nbdocs=100,index='web_nospam' | |
71 | + q = Indri::IndriQuery.new({:query => "#combine ( #{query} )", :count => nbdocs},"-trecFormat=true") | |
72 | + indri_index = Indri::IndriIndex.new IndexPaths[index.to_sym] | |
73 | + docs = indri_index.runquery(q).force_encoding("ISO-8859-1").encode("UTF-8") | |
74 | + query_list = docs.split("\n").collect { |p| p.scan(/\d+ Q0 (.+) \d+ -\d+.\d+ .+/).first }.flatten | |
75 | + | |
76 | + res = entities.pmap(15) do |e| | |
77 | + eq = Indri::IndriQuery.new({:query => "#combine ( #{e.gsub(/[^a-zA-Z0-9\s]/,'')} )", :count => nbdocs},"-trecFormat=true") | |
78 | + edocs = indri_index.runquery(eq).force_encoding("ISO-8859-1").encode("UTF-8") | |
79 | + e_list = edocs.split("\n").collect { |p| p.scan(/\d+ Q0 (.+) \d+ -\d+.\d+ .+/).first }.flatten | |
80 | + | |
81 | + rels = e_list&query_list | |
82 | + | |
83 | + ave_p = 1.upto(nbdocs).inject(0.0) do |sum,k| | |
84 | + p = (e_list.first(k)&rels).count.to_f/k | |
85 | + rel = rels.include?(e_list[k-1]) ? 1.0 : 0.0 | |
86 | + sum + p*rel | |
87 | + end | |
88 | + | |
89 | + {:name => e, :score => ave_p} | |
90 | + end | |
91 | + | |
92 | + res.sort { |a,b| b[:score] <=> a[:score] } | |
93 | + end | |
94 | + | |
95 | + def Context.query_entities query,nb_docs=10 | |
96 | + sources = ['wiki_en2012'] | |
97 | +# sources = ['wiki_en2012','web_nospam','nyt','gigaword'] | |
98 | +# sources = ['web_fr'] | |
99 | + sc = Hash.new { |h,k| h[k] = 0.0 } | |
100 | + | |
101 | + sources.each do |source| | |
102 | + puts " == source : #{source}" | |
103 | + c = ConceptModel.new query,source,nb_docs | |
104 | + p c.query | |
105 | + | |
106 | + c.concepts.each do |concept| | |
107 | + query_str = concept.words[0,4].join " " | 
108 | + | 
109 | + d1 = Context::label_candidate query_str.sequential_dependence_model,'wiki_en' | 
110 | + d2 = Context::label_candidate query_str.sequential_dependence_model,'wiki_en2012' | 
111 | + d3 = Mirimiri::WikipediaPage.search_wikipedia_titles query_str | 
112 | + | |
113 | + d1 = [] if d1.nil? | |
114 | + d2 = [] if d2.nil? | |
115 | + d3 = [] if d3.nil? | |
116 | + | |
117 | + d = d2 & d3 | |
118 | + labels = d.collect { |c| c.downcase.gsub(/[^\w\d]/,' ') } | |
119 | + p d | |
120 | + | |
121 | + | |
122 | + mins = -10000000 | |
123 | + lab = nil | |
124 | + scores = labels.collect do |l| | |
125 | + s = concept.score_label l | |
126 | + if s > mins | |
127 | + mins = s | |
128 | + lab = l | |
129 | + end | |
130 | + sc[l] += s*(concept.coherence/c.total_coherence) | |
131 | + { :label => l, :score => s } | |
132 | + end | |
133 | + | |
134 | + print (concept.coherence/c.total_coherence).to_s+" <= " | |
135 | + p concept.elements.collect { |c| c.word } | |
136 | + end | |
137 | + end | |
138 | + | |
139 | + sc.sort { |a,b| b[1] <=> a[1] } | |
140 | + end | |
141 | + | |
142 | + def Context.label_candidate query,index,nb_candidates=10,rm3=false # nb_candidates default assumed: Context.query_entities calls this with two arguments | 
143 | +# Mirimiri::WikipediaPage.search_wikipedia_titles query | |
144 | + args = rm3 ? "-fbDocs=20 -fbTerms=30 -fbMu=2000 -fbOrigWeight=0.7" : "" | |
145 | + q = Indri::IndriQuery.new({:query => query, :count => nb_candidates},"-printDocuments=true -trecFormat=true #{args}") | |
146 | + indri_index = Indri::IndriIndex.new IndexPaths[index.to_sym] | 
147 | + docs = indri_index.runquery q | 
148 | + docs = docs.force_encoding("ISO-8859-1").encode("UTF-8") if ['web_fr','web_en','web_nospam'].include? index # compare against the source key, not the IndriIndex object | 
149 | + idocs = Indri::IndriPrintedDocuments.new(docs) | |
150 | + | |
151 | + wiki_titles = idocs.extract_docs.collect do |d| | |
152 | + t = Nokogiri::HTML d | |
153 | + t.xpath('//title').text | |
154 | + end | |
155 | + | |
156 | + wiki_titles | |
157 | + end | |
158 | + | |
159 | + def Context.lcm query | |
160 | + source = 'nyt' | |
161 | + | |
162 | + a = Time.now | |
163 | + qc = QueryContext.new(1.upto(20).collect do |nb_docs| | |
164 | + beg = Time.now | |
165 | + c = ConceptModel.new query,source,nb_docs | |
166 | + puts "#{nb_docs} ==> Time elapsed: #{Time.now-beg} seconds" | |
167 | + c | |
168 | + end) | |
169 | + puts "All concepts : #{Time.now-a} seconds" | |
170 | + | |
171 | + model = qc.best_concept_model | |
172 | + puts "Total : #{Time.now-a} seconds" | |
173 | + model | |
174 | + end | |
175 | + | |
176 | + def Context.term_context index_path,query,size,num_page,args={} | |
177 | + terms = self.term_concepts index_path,query,size,num_page,args | |
178 | + args[:window] ||= 1 | |
179 | + | |
180 | +# context = "#weight ( #{terms.compact.sort{ |a,b| b[0] <=> a[0]}[0,size].collect { |e| "#{e[0]} #1(#{e[1]})" }.join " "} ) " unless terms.empty? | |
181 | + context = "#weight ( #{terms.collect { |c| "#{"%.10f" % c[:score]} #uw#{args[:window]}(#{c[:concept]})" }.join " "} ) " unless terms.empty? | |
182 | + | |
183 | + context | |
184 | + end | |
185 | + | |
186 | +# From the SIGIR'06 paper: `Improving the estimation of relevance models using large external corpora` (see the note after this diff) | 
187 | +# | |
188 | + def Context.morm index_path,query,size,num_page | |
189 | + docs,scores,names = self.feedback_docs index_path,query,num_page | |
190 | + | |
191 | + terms = [] | |
192 | + | |
193 | + docs.each do |d| | |
194 | + r = Mirimiri::Document.new d | |
195 | + tmp = self.extract_ngrams r,:tf,1 | |
196 | + terms += tmp.compact.collect { |t| [t[0]*Math.exp(scores[docs.index(d)].to_f),t[1]] } | |
197 | + end | |
198 | + | |
199 | + final = terms.compact.sort{ |a,b| b[0] <=> a[0]}[0,size].collect { |e| { :score => e[0], :concept => e[1] } } | |
200 | + context = "#weight ( #{final.collect { |c| "#{"%.10f" % c[:score]} #1(#{c[:concept]})" }.join " "} ) " unless terms.empty? | |
201 | + | |
202 | + context | |
203 | + end | |
204 | + | |
205 | + def Context.term_concepts index_path,query,size,num_page,args={} | |
206 | + args[:func] ||= :entropy | |
207 | + args[:window] ||= 1 | |
208 | + | |
209 | + docs = self.feedback_docs index_path,query,num_page | |
210 | + | |
211 | + resource = Mirimiri::Document.new docs.join(' ') | |
212 | + terms = self.extract_ngrams resource,args[:func].to_sym,args[:window] | |
213 | + | |
214 | + terms.compact.sort{ |a,b| b[0] <=> a[0]}[0,size].collect { |e| { :score => e[0], :concept => e[1] } } | |
215 | + end | |
216 | + | |
217 | + | |
218 | + def Context.sentence_similarity s1,s2,index_path | |
219 | + q = s1.is_a?(String) ? s1.split : s1 | |
220 | + r = s2.is_a?(String) ? s2.split : s2 | |
221 | + | |
222 | + inter = q & r | |
223 | + | |
224 | + s = (inter.count/q.count.to_f) * inter.inject(0.0) { |sum,w| sum + Math.log(Context.df index_path,w) } | |
225 | + s | |
226 | + end | |
227 | + | |
228 | + | |
229 | + private | |
230 | + | |
231 | + def Context.df index_path,w,window=1 | |
232 | + if @@count[index_path]["total#{window}"].nil? | |
233 | + total = `dumpindex #{index_path} s`.match(/total terms:\t(.*)/)[1].to_f-(window-1).to_f | |
234 | + @@semaphore.synchronize { | |
235 | + @@count[index_path]["total#{window}"] = total | |
236 | + } | |
237 | + end | |
238 | + | |
239 | + if @@df[index_path]["#uw#{window}(#{w})"].nil? | |
240 | + nb = `dumpindex #{index_path} e "#uw#{window}(#{w})" | awk ' { arr[$1]=$0 } END { for ( key in arr ) { print arr[key] } } ' | wc -l`.chomp.split(':').last.to_f - 1 | |
241 | + @@semaphore.synchronize { | |
242 | + @@df[index_path]["#uw#{window}(#{w})"] = nb+1.0 | |
243 | + } | |
244 | + end | |
245 | + begin | |
246 | + d = @@count[index_path]["total#{window}"]/@@df[index_path]["#uw#{window}(#{w})"] | |
247 | + rescue | |
248 | + puts w | |
249 | + exit | |
250 | + end | |
251 | + d | |
252 | + end | |
253 | + | |
254 | + def Context.prob_w index_path,w,window=1 | |
255 | + if @@count[index_path]["total#{window}"].nil? | |
256 | + total = `dumpindex #{index_path} s`.match(/total terms:\t(.*)/)[1].to_f-(window-1).to_f | |
257 | + @@semaphore.synchronize { | |
258 | + @@count[index_path]["total#{window}"] = total+1.0 | |
259 | + } | |
260 | + end | |
261 | + | |
262 | + nb = self.count_w index_path,w,window | |
263 | + nb/@@count[index_path]["total#{window}"] | |
264 | + end | |
265 | + | |
266 | + def Context.count_w index_path,w,window=1 | |
267 | + if @@count[index_path]["##{window}(#{w})"].nil? | |
268 | + nb = `dumpindex #{index_path} x "##{window}(#{w})"`.chomp.split(':').last.to_f | |
269 | + @@semaphore.synchronize { | |
270 | + @@count[index_path]["##{window}(#{w})"] = ("%.15f" % nb).to_f+1.0 | |
271 | + } | |
272 | + end | |
273 | + @@count[index_path]["##{window}(#{w})"] | |
274 | + end | |
275 | + | |
276 | + | |
277 | + public | |
278 | + def Context.extract_ngrams resource,func,n | |
279 | + raw_terms = 1.upto(n).collect { |i| resource.ngrams(i) }.flatten | |
280 | +# raw_terms = resource.ngrams(n).flatten | |
281 | + terms = raw_terms.uniq.collect { |w| [resource.send(func.to_sym,w), w.unaccent] unless w.is_stopword? || w.split.any? { |e| e.length <= 2 } || w.split.any? { |e| e !~ /[a-zA-Z]/ } || w.include?(".") || (Mirimiri::Stoplist&w.unaccent.split).count >= 2 } | |
282 | +# terms = raw_terms.uniq.collect { |w| w=w.gsub(/\W/,' ').strip; [resource.send(func.to_sym,w), w.unaccent] unless w.is_stopword? || w.split.any? { |e| e.length <= 2 } || w.split.any? { |e| e !~ /[a-zA-Z]/ } || w.include?(".") || (Mirimiri::Stoplist&w.unaccent.split).count >= 1 } | |
283 | +# terms = raw_terms.uniq.collect { |w| [resource.send(func.to_sym,w), w.unaccent] unless w.is_stopword? || w.split.all? { |e| e.length <= 1 } || w.split.all? { |e| e !~ /[a-zA-Z]/ } || w.include?(".") } | |
284 | + terms | |
285 | + end | |
286 | + | |
287 | + def Context.feedback_docs index_path,query,num_page | |
288 | + query = Indri::IndriQuery.new({:query => query, :count => num_page},"-printDocuments=true -trecFormat=true") | |
289 | + index = Indri::IndriIndex.new index_path | |
290 | + idocs = Indri::IndriPrintedDocuments.new(index.runquery(query).force_encoding("ISO-8859-1").encode("UTF-8")) | |
291 | + | |
292 | + texts,scores,names = idocs.extract_docs_score | |
293 | + | |
294 | + docs = texts.collect do |idoc| | |
295 | + begin | |
296 | + Sanitize.clean idoc,:remove_contents => ['script','style'] | |
297 | + rescue | |
298 | + d = Nokogiri::HTML(idoc) | |
299 | + d.xpath('//text()').text | |
300 | + end | |
301 | + end | |
302 | + | |
303 | + return docs,scores,names | |
304 | + end | |
305 | + | |
306 | +end |
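
Note on Context.query_aspects (the SIGKDD 2007 method above): the printed score is existence * support, where existence = dp/d compares the exact ordered-phrase count against loose co-occurrence, and support = dp/denom compares it against the counts of all other orderings of the same words. A minimal runnable sketch; the count stub and its numbers are hypothetical stand-ins for the dumpindex-backed Context.count_w:

    #!/usr/bin/env ruby
    # Hypothetical stand-in for Context.count_w, which queries an Indri index
    # using operators such as #1(...) for exact ordered phrases.
    def count phrase_query
      { '#1(new york)' => 900.0,   # exact ordered phrase
        'new york'     => 1000.0,  # loose co-occurrence
        '#1(york new)' => 10.0     # reversed ordering
      }.fetch(phrase_query, 1.0)
    end

    def aspect_score ngram
      dp = count "#1(#{ngram})"
      d  = count ngram
      denom = ngram.split.permutation.inject(0.0) do |res,perm|
        res + (perm == ngram.split ? 0.0 : count("#1(#{perm.join(' ')})"))
      end
      existence = dp / d
      support   = denom.zero? ? 0.0 : dp / denom
      existence * support
    end

    puts aspect_score('new york')   # 0.9 * 90.0 => 81.0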
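
Note on Context.entity_web_ranking (the CIKM 2007 method above): each entity e is ranked by an average-precision-style score over its own document list e_list, where the pseudo-relevant set is rels = e_list ∩ query_list (documents retrieved both for the entity and for the query). What the code computes is the un-normalized sum

    score(e) = \sum_{k=1}^{nbdocs} P@k \cdot rel(k),  with  P@k = |e_list[1..k] \cap rels| / k  and  rel(k) = 1 iff e_list[k] \in rels;

standard average precision would additionally divide by |rels|.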
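
Note on Context.term_context and Context.morm: both end by serializing their scored concepts into an Indri #weight query. An illustrative term_context output with args[:window] = 1 (weights made up):

    #weight ( 0.0421170312 #uw1(solar energy) 0.0377810945 #uw1(photovoltaic) 0.0311200018 #uw1(panels) )

In morm, following the SIGIR'06 relevance-model estimation cited above it, each unigram t from a feedback document D is scored tf(t,D) * e^{score(D)}, where score(D) is D's Indri query-likelihood log-score, and the highest-scoring entries form the expansion query.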
lib/context/concept.rb
1 | +#!/usr/bin/env ruby | |
2 | + | |
3 | +class Concept | |
4 | + attr_reader :elements, :coherence | |
5 | + | |
6 | + def initialize | |
7 | + @elements = [] | |
8 | + @coherence = 0 | |
9 | + end | |
10 | + | |
11 | + def <<(elem) | |
12 | + raise ArgumentError, 'Argument must be a ConceptualElement.' unless elem.is_a? ConceptualElement | |
13 | + @elements << elem | |
14 | + end | |
15 | + | |
16 | + def compute_coherence scores,theta,k | 
17 | +# update_feedback_coherence arg | |
18 | + @coherence = 0.upto(theta.count-1).inject(0.0) do |sum,i| | |
19 | + sum + Math.exp(theta[i][k])*Math.exp(scores[i].to_f) | |
20 | + end | |
21 | + end | |
22 | + | |
23 | + def score_label label,index_path=Context::IndexPaths[:wiki_en2012] | |
24 | + s = @elements.inject(0.0) do |res,e| | |
25 | +# *self.prob_w(index_path,"#{w[:word]} #uw10(#{label})") | |
26 | + res + e.prob*Math.log(Context.prob_w(index_path,"#{e.word} #uw10(#{label})")/(e.p_in_coll*Context.prob_w(index_path,label))) | |
27 | + end | |
28 | + | |
29 | + s | |
30 | + end | |
31 | + | |
32 | + def concept_words_similarity s,index_path=Context::IndexPaths[:wiki_en] | |
33 | +# inter = @elements.collect { |w| w unless (w.word & s).empty? } | |
34 | + inter = self.words & s.words | |
35 | + | |
36 | + sim = (inter.count/self.words.count.to_f) * inter.inject(0.0) { |sum,w| sum + Math.log(Context.df index_path,w) } | |
37 | + sim | |
38 | + end | |
39 | + | |
40 | + def weighted_concept_similarity s,index_path=Context::IndexPaths[:wiki_en] | |
41 | + inter = self.words & s.words | |
42 | + sim = (inter.count/self.words.count.to_f) | |
43 | + sim *= @elements.inject(0.0) do |sum,w| | |
44 | + wp = s.get_element_from_word w.word | |
45 | + | |
46 | + s.words.include?(w.word) ? sum + wp.prob*w.prob*Math.log(Context.df index_path,w.word) : sum + 0.0 | |
47 | + end | |
48 | + sim | |
49 | + end | |
50 | + | |
51 | + def get_element_from_word w | |
52 | + @elements.select { |e| e.word == w }.first | |
53 | + end | |
54 | + | |
55 | + def words | |
56 | + @elements.collect { |w| w.word } | |
57 | + end | |
58 | + | |
59 | + def word_probs | |
60 | + res = {} | |
61 | + @elements.each { |w| res[w.word] = w.prob } | |
62 | + res | |
63 | + end | |
64 | + | |
65 | +# From the papers: | 
66 | +# NAACL'10: `Automatic Evaluation of Topic Coherence` | |
67 | +# EMNLP'12: `Exploring Topic Coherence over many models and many topics` | |
68 | +# | |
69 | + def uci_coherence epsilon=1,index_path=Context::IndexPaths[:wiki_en] | |
70 | + coherence = @elements.combination(2).inject(0.0) do |res,bigram| | |
71 | +#Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}",20)* | |
72 | + t = (Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}",20)+epsilon)/((bigram.first.p_in_coll)*(bigram.last.p_in_coll)) | |
73 | + res + Math.log(t) | |
74 | + end | |
75 | + | |
76 | + coherence /= @elements.count*(@elements.count-1) | |
77 | + coherence | |
78 | + end | |
79 | + | |
80 | + def uci_coherence_entropy index_path=Context::IndexPaths[:wiki_en] | |
81 | + coherence = @elements.combination(2).inject(0.0) do |res,bigram| | |
82 | +#Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}",20)* | |
83 | + t = (Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}",20))/((bigram.first.p_in_coll)*(bigram.last.p_in_coll)) | |
84 | + res + t | |
85 | + end | |
86 | + | |
87 | + coherence /= @elements.count*(@elements.count-1) | |
88 | + coherence | |
89 | + end | |
90 | + | |
91 | + protected | |
92 | + def update_coherence index_path=Context::IndexPaths[:wiki_en] | |
93 | + coherence = @elements.combination(2).inject(0.0) do |res,bigram| | |
94 | + res + Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}")*Math.log(Context.prob_w(index_path,"#{bigram.first.word} #{bigram.last.word}")/((bigram.first.p_in_coll index_path)*(bigram.last.p_in_coll index_path))) | |
95 | + end | |
96 | + | |
97 | + coherence /= @elements.count*(@elements.count-1) | |
98 | + @coherence = coherence | |
99 | + end | |
100 | + | |
101 | + def update_feedback_coherence documents | |
102 | + corpus = Mirimiri::Document.new documents.join " " | |
103 | + | |
104 | + windows = corpus.ngrams(10).collect { |w| w.split } | |
105 | + | |
106 | + coherence = @elements.combination(2).inject(0.0) do |res,bigram| | |
107 | + big_prob = windows.count{ |c| c.include?(bigram.first.word) && c.include?(bigram.last.word) }.to_f/windows.count | |
108 | + mi = big_prob.zero? ? 0.0 : big_prob*bigram.first.prob*bigram.last.prob*Math.log(big_prob/(corpus.tf(bigram.first.word)*corpus.tf(bigram.last.word))) | |
109 | + res + mi | |
110 | + end | |
111 | + | |
112 | + coherence /= @elements.count*(@elements.count-1) | |
113 | + @coherence = coherence | |
114 | + end | |
115 | + | |
116 | +end |
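
Note on Concept#score_label: a candidate label l is scored by the concept's expected pointwise mutual information between the label and each concept word, with joint probabilities estimated from #uw10 windows on the wiki_en2012 index:

    score(l) = \sum_{e} p(e) \log \frac{p(e, l)}{p_{coll}(e)\, p(l)}

where p(e) is the element's in-topic probability and p_coll(e) its collection probability.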
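
Note on Concept#uci_coherence: this implements the UCI coherence measure from the NAACL'10 and EMNLP'12 papers cited above, estimating joint probabilities from 20-term unordered windows with epsilon smoothing:

    C_{UCI} = \frac{1}{N(N-1)} \sum_{i<j} \log \frac{p(w_i, w_j) + \epsilon}{p(w_i)\, p(w_j)}

The code sums over the N(N-1)/2 unordered pairs but divides by N(N-1), i.e. half the usual 2/(N(N-1)) normalization; since that factor is constant for a fixed number of elements, relative comparisons are unchanged.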
lib/context/concept_model.rb
1 | +#!/usr/bin/env ruby | |
2 | + | |
3 | +require 'lda-ruby' | |
4 | +require 'peach' | |
5 | + | |
6 | +class ConceptModel | |
7 | + attr_reader :concepts,:documents,:source,:nbdocs,:nbterms,:query,:total_coherence,:doc_scores,:doc_names,:theta,:entropy_coherence,:avg_coherence | |
8 | + | |
9 | + def ConceptModel.parse_hdp str | |
10 | + concepts = [] | |
11 | + eval(str).each do |hdp_top| | |
12 | + c = Concept.new | |
13 | + hdp_top.gsub(/topic \d: /,'').split(" + ").each do |words| | |
14 | + ee = words.split('*') | |
15 | + begin | |
16 | + e = ConceptualElement.new ee[1],ee[0].to_f | |
17 | + c << e | |
18 | + rescue ArgumentError | |
19 | + next | |
20 | + end | |
21 | + end | |
22 | + | |
23 | + concepts << c | |
24 | + end | |
25 | + concepts | |
26 | + end | |
27 | + | |
28 | + def initialize query,source,nb_docs,nb_terms=10,k=false | |
29 | + raise ArgumentError, 'Argument 1 must be a String.' unless query.is_a? String | |
30 | + raise ArgumentError, 'Argument 2 must be a valid Index key.' unless Context::IndexPaths.has_key?(source.to_sym) | |
31 | + | |
32 | + @source = source.to_sym | |
33 | + @nbdocs = nb_docs | |
34 | + @nbterms = nb_terms | |
35 | + @query = query | |
36 | + @concepts = [] | |
37 | + @total_coherence = 0.0 | |
38 | + | |
39 | + corpus = Lda::Corpus.new | |
40 | + | |
41 | + @documents,@doc_scores,@doc_names = Context.feedback_docs Context::IndexPaths[@source],@query,@nbdocs | |
42 | + @documents.each do |d| | |
43 | + doc = Lda::TextDocument.new corpus,d | |
44 | + corpus.add_document doc | |
45 | + end | |
46 | + | |
47 | + if k == false | |
48 | + num_topics = topic_divergence corpus | |
49 | + else | |
50 | + num_topics = k | |
51 | + end | |
52 | + | |
53 | + lda = Lda::Lda.new corpus | |
54 | + lda.verbose=false | |
55 | + lda.num_topics = num_topics | |
56 | + | |
57 | + lda.em('random') | |
58 | + | |
59 | + @beta = lda.beta # to avoid repeated expensive computation | |
60 | + @vocab = lda.vocab # | |
61 | + | |
62 | + @theta = lda.compute_topic_document_probability | |
63 | + | |
64 | +# Normalizing the phi_t(w) weights for each topic | |
65 | +# | |
66 | + total_prob = {} | |
67 | + tmp_top_word_indices(@nbterms,@vocab,@beta).each_pair do |topic,indices| | |
68 | + total_prob[topic] = indices.inject(0.0) { |res,i| res + Math.exp(@beta[topic][i].to_f) } | |
69 | + end | |
70 | + | |
71 | + tmp_top_word_indices(@nbterms,@vocab,@beta).each_pair do |topic,indices| | |
72 | + c = Concept.new | |
73 | + indices.each do |i| | |
74 | + begin | |
75 | + e = ConceptualElement.new @vocab[i],(Math.exp(@beta[topic][i].to_f)/total_prob[topic]) | |
76 | + c << e | |
77 | + rescue ArgumentError | |
78 | + next | |
79 | + end | |
80 | + end | |
81 | + | |
82 | + c.compute_coherence @doc_scores,@theta,topic | |
83 | + | |
84 | +# c.compute_coherence @doc_scores,gamma_m,topic # takes time since it has to compute several probabilities | |
85 | + @concepts << c | |
86 | + @total_coherence += c.coherence | |
87 | + end | |
88 | + end | |
89 | + | |
90 | + def to_s | |
91 | + @concepts.collect do |c| | |
92 | + "#{c.coherence/@total_coherence} => [#{c.elements.collect do |e| | |
93 | + "#{e.prob} #{e.word}" | |
94 | + end.join(', ') | |
95 | + }]" | |
96 | + end.join "\n" | |
97 | + end | |
98 | + | |
99 | + def to_indriq | |
100 | + "#weight( #{@concepts.collect do |c| | |
101 | + "#{c.coherence/@total_coherence} #weight ( #{c.elements.collect do |e| | |
102 | + "#{e.prob} #{e.word}" | |
103 | + end.join(' ') | |
104 | + } ) " | |
105 | + end.join " "} )" | |
106 | + end | |
107 | + | |
108 | + def <<(concept) | |
109 | + raise ArgumentError, 'Argument must be a Concept.' unless concept.is_a? Concept | 
110 | + @concepts << concept | |
111 | + end | |
112 | + | |
113 | + def avg_model_coherence | |
114 | + if @documents.empty? | |
115 | + @avg_coherence = 0.0 | |
116 | + else | |
117 | + @avg_coherence = @concepts.inject(0.0) { |res,c| res + c.uci_coherence }/@concepts.count #if @avg_coherence.nil? | |
118 | + end | |
119 | + @avg_coherence | |
120 | + end | |
121 | + | |
122 | + def entropy_model_coherence | |
123 | + if @documents.empty? | |
124 | + @entropy_coherence = 0.0 | |
125 | + else | |
126 | + @entropy_coherence = @concepts.inject(0.0) do |res,c| | |
127 | + ent = c.uci_coherence_entropy | |
128 | + ent += 1e-25 if ent.zero? | 
129 | + res + ent*Math.log(ent) | |
130 | + end #if @entropy_coherence.nil? | |
131 | + end | |
132 | + @entropy_coherence | |
133 | + end | |
134 | + | |
135 | + private | |
136 | + def topic_divergence corpus | |
137 | + max_kl = 0.0 | |
138 | +# Old trick to limit number of iterations | |
139 | +# num_p = @nbdocs < 6 ? @nbdocs + 5 : @nbdocs | |
140 | + | |
141 | + semaphore = Mutex.new | |
142 | + | |
143 | + 1.upto(20).inject do |k,ntop| | |
144 | +# 1.upto(num_p).inject do |k,ntop| | |
145 | + lda = Lda::Lda.new corpus | |
146 | + lda.verbose=false | |
147 | + lda.num_topics = ntop | |
148 | + lda.em('random') | |
149 | + beta_m = lda.beta # to avoid repeated expensive computation | |
150 | + vocab = lda.vocab | |
151 | + | |
152 | + topics_i = Array.new(ntop) { |i| i } | |
153 | + | |
154 | + sum_kl = topics_i.combination(2).inject(0.0) do |kl,topics| | |
155 | + ti = topics.first | |
156 | + tj = topics.last | |
157 | + begin | |
158 | + kl + 0.upto(vocab.count-1).inject(0.0) do |res,w_i| | |
159 | + res + ( Math.exp(beta_m[ti][w_i])*Math.log(Math.exp(beta_m[ti][w_i])/Math.exp(beta_m[tj][w_i])) + Math.exp(beta_m[tj][w_i])*Math.log(Math.exp(beta_m[tj][w_i])/Math.exp(beta_m[ti][w_i])) ) | |
160 | + end | |
161 | + rescue | |
162 | + kl + 0.0 | |
163 | + end | |
164 | + end | |
165 | + | |
166 | + sum_kl /= ntop*(ntop-1) | |
167 | + sum_kl = max_kl if sum_kl.nan? || sum_kl.infinite? | |
168 | + | |
169 | + sum_kl <= max_kl ? k : (max_kl = sum_kl; ntop) | 
170 | + end | |
171 | + end | |
172 | + | |
173 | + def tmp_top_word_indices(words_per_topic = 10,vocab,beta) | |
174 | + raise 'No vocabulary loaded.' unless vocab | |
175 | + | |
176 | + # find the highest scoring words per topic | |
177 | + topics = Hash.new | |
178 | + indices = (0...vocab.size).to_a | |
179 | + | |
180 | + beta.each_with_index do |topic, topic_num| | |
181 | + topics[topic_num] = (topic.zip((0...vocab.size).to_a).sort { |i, j| i[0] <=> j[0] }.map { |i, j| j }.reverse)[0...words_per_topic] | |
182 | + end | |
183 | + | |
184 | + topics | |
185 | + end | |
186 | + | |
187 | +end |
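
Note on the private ConceptModel#topic_divergence: the number of LDA topics is selected by sweeping k from 1 to 20 and keeping the k whose model maximizes the average symmetric Kullback-Leibler divergence between topic-word distributions (lda-ruby's beta matrix stores log probabilities, hence the Math.exp calls):

    D_{sym}(\varphi_i, \varphi_j) = \sum_{w} \varphi_i(w) \log\frac{\varphi_i(w)}{\varphi_j(w)} + \varphi_j(w) \log\frac{\varphi_j(w)}{\varphi_i(w)}

summed over all unordered topic pairs and divided by k(k-1); a NaN or infinite average falls back to the best value seen so far.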
lib/context/conceptual_element.rb
1 | +#!/usr/bin/env ruby | |
2 | + | |
3 | +class ConceptualElement | |
4 | + attr_reader :word, :prob | |
5 | + | |
6 | + def initialize w,s | |
7 | + raise ArgumentError, 'Argument 1 must be a String.' unless w.is_a? String | |
8 | + raise ArgumentError, 'Argument 2 must be a Float.' unless s.is_a? Float | |
9 | + | |
10 | + tmp = w.gsub(/(-)\1+/,'-').gsub(/([^\w-].*|^-|-$)/,'') | |
11 | + raise ArgumentError, 'Argument 1 is not a useful word.' if tmp.is_stopword? || tmp.size < 2 | 
12 | + | |
13 | + @word = tmp | |
14 | + @prob = s | |
15 | + end | |
16 | + | |
17 | + def p_in_coll index_path=Context::IndexPaths[:wiki_en],size=20 | |
18 | + Context.prob_w index_path,@word,size | |
19 | + end | |
20 | +end |
lib/context/query_context.rb
1 | +#!/usr/bin/env ruby | |
2 | + | |
3 | +class QueryContext < Array | |
4 | + | |
5 | + def best_concept_model | |
6 | + max_sim = 0.0 | |
7 | + best = nil | |
8 | + | |
9 | + for p in 0...self.count | |
10 | + sim = 0.0 | |
11 | + for pp in 0...self.count | |
12 | + next if pp == p | |
13 | + combs = self.at(p).concepts.product self.at(pp).concepts | |
14 | + sum_sim = combs.inject(0.0) { |sum,k| sum + k.first.weighted_concept_similarity(k.last) } | |
15 | + sim += sum_sim/combs.count | |
16 | + end | |
17 | + | |
18 | + | |
19 | + if sim > max_sim | |
20 | + max_sim = sim | |
21 | + best = p | |
22 | + end | |
23 | + end | |
24 | + | |
25 | + best.nil? ? nil : self.at(best) | |
26 | + end | |
27 | + | |
28 | + def best_concept_model_word | |
29 | + max_sim = 0.0 | |
30 | + best = nil | |
31 | + | |
32 | + for p in 0...self.count | |
33 | + sim = 0.0 | |
34 | + for pp in 0...self.count | |
35 | + next if pp == p | |
36 | + combs = self.at(p).concepts.product self.at(pp).concepts | |
37 | + sum_sim = combs.inject(0.0) { |sum,k| sum + k.first.concept_words_similarity(k.last) } | |
38 | + sim += sum_sim/combs.count | |
39 | + end | |
40 | + | |
41 | + | |
42 | + if sim > max_sim | |
43 | + max_sim = sim | |
44 | + best = p | |
45 | + end | |
46 | + end | |
47 | + | |
48 | + best.nil? ? nil : self.at(best) | |
49 | + end | |
50 | +end |
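
End-to-end usage sketch: the entry point added by this commit appears to be Context.lcm, which builds one ConceptModel per feedback-set size (1 to 20 documents, reading from the :nyt index) and lets QueryContext#best_concept_model keep the model most similar to all the others. This assumes the gems (mirimiri, lda-ruby, sanitize, peach) and the hard-coded index paths above are available on the machine; the query string is made up:

    #!/usr/bin/env ruby
    $LOAD_PATH.unshift 'lib'
    require 'context'

    model = Context.lcm 'dinosaur extinction theories'   # hypothetical query
    puts model            # one "coherence => [prob word, ...]" line per concept
    puts model.to_indriq  # nested Indri #weight(...) query built from the concepts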