#!/usr/bin/env ruby

require 'lda-ruby'
require 'peach'

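# Builds an LDA-based concept model from the feedback documents retrieved for
# a query: each LDA topic becomes a Concept holding its top words with
# normalized weights, scored by coherence. Relies on the Concept,
# ConceptualElement and Context classes defined elsewhere in this project.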
class ConceptModel
  attr_reader :concepts,:documents,:source,:nbdocs,:nbterms,:query,:total_coherence,:doc_scores,:doc_names,:theta,:entropy_coherence,:avg_coherence

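  # Parses an HDP output string (a Ruby array literal of
  # "topic N: w1*word1 + w2*word2 + ..." entries) into an array of Concepts.
  # Note: the string is eval'ed, so it must come from a trusted source.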
  def self.parse_hdp str
    concepts = []
    eval(str).each do |hdp_top|
      c = Concept.new
      hdp_top.gsub(/topic \d+: /,'').split(" + ").each do |words|
        ee = words.split('*')
        begin
          e = ConceptualElement.new ee[1],ee[0].to_f
          c << e 
        rescue ArgumentError
          next
        end
      end

      concepts << c
    end
    concepts
  end

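  # query    - free-text query used to retrieve feedback documents
  # source   - key into Context::IndexPaths naming the document index
  # nb_docs  - number of feedback documents to model
  # nb_terms - number of words kept per concept
  # k        - fixed number of topics, or false to choose it automatically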
  def initialize query,source,nb_docs,nb_terms=10,k=false
    raise ArgumentError, 'Argument 1 must be a String.' unless query.is_a? String
    raise ArgumentError, 'Argument 2 must be a valid Index key.' unless Context::IndexPaths.has_key?(source.to_sym)

    @source  = source.to_sym
    @nbdocs  = nb_docs
    @nbterms = nb_terms
    @query   = query
    @concepts = []
    @total_coherence = 0.0

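    # Build an LDA corpus from the feedback documents returned for the query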
    corpus = Lda::Corpus.new

    @documents,@doc_scores,@doc_names = Context.feedback_docs Context::IndexPaths[@source],@query,@nbdocs
    @documents.each do |d|
      doc = Lda::TextDocument.new corpus,d
      corpus.add_document doc
    end

    num_topics = k == false ? topic_divergence(corpus) : k

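    # Train LDA with variational EM, randomly initialized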
    lda = Lda::Lda.new corpus
    lda.verbose=false
    lda.num_topics = num_topics

    lda.em('random')

    @beta  = lda.beta   # cache beta and vocab to avoid repeated
    @vocab = lda.vocab  # expensive recomputation

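    # Per-document topic proportions, used later for coherence scoring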
    @theta = lda.compute_topic_document_probability

    # Normalize the phi_t(w) weights for each topic so the nb_terms retained
    # words sum to 1.0; compute the top-word indices once and reuse them
    top_indices = tmp_top_word_indices(@vocab,@beta,@nbterms)

    total_prob = {}
    top_indices.each_pair do |topic,indices|
      total_prob[topic] = indices.inject(0.0) { |res,i| res + Math.exp(@beta[topic][i].to_f) }
    end

    top_indices.each_pair do |topic,indices|
      c = Concept.new
      indices.each do |i| 
        begin
          e = ConceptualElement.new @vocab[i],(Math.exp(@beta[topic][i].to_f)/total_prob[topic])
          c << e
        rescue ArgumentError
          next
        end
      end

      c.compute_coherence @doc_scores,@theta,topic

      # Alternative (slower, since it has to compute several probabilities):
      # c.compute_coherence @doc_scores,gamma_m,topic
      @concepts << c
      @total_coherence += c.coherence
    end
  end

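  # One line per concept: its normalized coherence followed by its
  # "prob word" pairs.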
  def to_s
    @concepts.collect do |c|
      "#{c.coherence/@total_coherence} => [#{c.elements.collect do |e|
        "#{e.prob} #{e.word}"
      end.join(', ')
      }]"
    end.join "\n"
  end

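  # Renders the model as a nested Indri #weight query: concepts weighted by
  # normalized coherence, words by their within-concept probability.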
  def to_indriq
    "#weight( #{@concepts.collect do |c|
      "#{c.coherence/@total_coherence} #weight ( #{c.elements.collect do |e|
        "#{e.prob} #{e.word}"
      end.join(' ')
      } ) "
    end.join " "} )"
  end

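  # Appends an externally built Concept to the model.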
  def <<(concept)
    raise ArgumentError, 'Argument must be a Concept.' unless concept.is_a? Concept
    @concepts << concept
  end

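  # Mean UCI coherence over all concepts; 0.0 when no documents were retrieved.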
  def avg_model_coherence
    if @documents.empty?
      @avg_coherence = 0.0 
    else
      @avg_coherence = @concepts.inject(0.0) { |res,c| res + c.uci_coherence }/@concepts.count #if @avg_coherence.nil?
    end
    @avg_coherence
  end

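  # Entropy-style aggregate of the per-concept UCI coherence values (sum of
  # c*log(c)); a tiny epsilon guards against log(0).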
  def entropy_model_coherence
    if @documents.empty?
      @entropy_coherence = 0.0 
    else  
      @entropy_coherence = @concepts.inject(0.0) do |res,c| 
        ent = c.uci_coherence_entropy
        ent = 1e-25 if ent.zero? # avoid Math.log(0)
        res + ent*Math.log(ent)
      end #if @entropy_coherence.nil?
    end
    @entropy_coherence
  end

  private
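  # Picks the number of LDA topics: trains a model for each candidate count
  # and keeps the one maximizing the mean symmetric KL divergence between
  # topic pairs (well-separated topics indicate a better fit).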
  def topic_divergence corpus
    max_kl = 0.0
# Old trick to limit number of iterations
#    num_p = @nbdocs < 6 ? @nbdocs + 5 : @nbdocs 

    1.upto(20).inject do |k,ntop|
#    1.upto(num_p).inject do |k,ntop|
      lda = Lda::Lda.new corpus
      lda.verbose=false
      lda.num_topics = ntop
      lda.em('random')
      beta_m = lda.beta   # to avoid repeated expensive computation
      vocab  = lda.vocab

      topics_i = Array.new(ntop) { |i| i }

      sum_kl = topics_i.combination(2).inject(0.0) do |kl,topics|
        ti = topics.first
        tj = topics.last
        begin
          kl + 0.upto(vocab.count-1).inject(0.0) do |res,w_i|
            p_i = Math.exp(beta_m[ti][w_i])
            p_j = Math.exp(beta_m[tj][w_i])
            # symmetric KL contribution of word w_i
            res + p_i*Math.log(p_i/p_j) + p_j*Math.log(p_j/p_i)
          end
        rescue
          kl + 0.0
        end
      end

      sum_kl /= ntop*(ntop-1)
      sum_kl = max_kl if sum_kl.nan? || sum_kl.infinite? 

      # keep whichever topic count has produced the highest divergence so far
      if sum_kl <= max_kl
        k
      else
        max_kl = sum_kl
        ntop
      end
    end
  end

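  # Returns a Hash mapping each topic index to the indices of its
  # words_per_topic highest-weighted words in the beta matrix.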
  def tmp_top_word_indices(vocab,beta,words_per_topic = 10)
    raise 'No vocabulary loaded.' unless vocab

    # find the highest scoring words per topic
    topics = Hash.new
    indices = (0...vocab.size).to_a

    beta.each_with_index do |topic, topic_num|
      topics[topic_num] = topic.zip(indices).sort_by { |prob,_| prob }.map { |_,i| i }.reverse.first(words_per_topic)
    end

    topics
  end

end
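
# Minimal usage sketch (hypothetical query string and index key; assumes
# Context::IndexPaths defines :ap88 and the Concept/ConceptualElement classes
# are loaded):
#
#   model = ConceptModel.new('airbus subsidies', 'ap88', 10)
#   puts model              # concepts with normalized coherence weights
#   puts model.to_indriq    # expanded Indri #weight query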