lib/context/concept_model.rb

  #!/usr/bin/env ruby
  
  require 'lda-ruby'
  require 'peach'
  
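  # ConceptModel builds a topic-based concept model for a query: it retrieves
  # pseudo-relevance feedback documents from an index, fits an LDA model over
  # them, turns each topic into a Concept of weighted ConceptualElements, and
  # weights every concept by its coherence.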
  class ConceptModel
    attr_reader :concepts, :documents, :source, :nbdocs, :nbterms, :query,
                :total_coherence, :doc_scores, :doc_names, :theta,
                :entropy_coherence, :avg_coherence
  
    def ConceptModel.parse_hdp str
      concepts = []
      eval(str).each do |hdp_top|
        c = Concept.new
        # Each entry looks like "topic N: w1*word1 + w2*word2 + ...";
        # \d+ so that topic ids above 9 are stripped as well.
        hdp_top.gsub(/topic \d+: /, '').split(' + ').each do |words|
          weight, word = words.split('*')
          begin
            e = ConceptualElement.new word, weight.to_f
            c << e
          rescue ArgumentError
            next
          end
        end

        concepts << c
      end
      concepts
    end
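
    # A minimal usage sketch for parse_hdp. The input format is inferred from
    # the parsing above: a string that evals to an array of
    # "topic N: weight*word + ..." entries (words and weights below are
    # illustrative only):
    #
    #   str = "['topic 0: 0.12*storm + 0.08*wind', 'topic 1: 0.10*rain']"
    #   concepts = ConceptModel.parse_hdp str
    #
    # Since parse_hdp evals its argument, it must only be fed trusted HDP
    # output.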
  
    def initialize query,source,nb_docs,nb_terms=10,k=false
      raise ArgumentError, 'Argument 1 must be a String.' unless query.is_a? String
      raise ArgumentError, 'Argument 2 must be a valid Index key.' unless Context::IndexPaths.has_key?(source.to_sym)
  
      @source  = source.to_sym
      @nbdocs  = nb_docs
      @nbterms = nb_terms
      @query   = query
      @concepts = []
      @total_coherence = 0.0
  
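      # Build an lda-ruby corpus from the top-ranked feedback documents
      # returned for the query.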
      corpus = Lda::Corpus.new
  
      @documents,@doc_scores,@doc_names = Context.feedback_docs Context::IndexPaths[@source],@query,@nbdocs
      @documents.each do |d|
        doc = Lda::TextDocument.new corpus,d
        corpus.add_document doc
      end
  
      # Use the supplied number of topics, or estimate one by maximizing
      # topic divergence when none is given.
      num_topics = k || topic_divergence(corpus)
  
      lda = Lda::Lda.new corpus
      lda.verbose = false
      lda.num_topics = num_topics

      # Fit the topic model with variational EM from a random initialization.
      lda.em('random')

      # Cache the topic-word log probabilities and the vocabulary to avoid
      # repeated expensive computation.
      @beta  = lda.beta
      @vocab = lda.vocab

      @theta = lda.compute_topic_document_probability
  
      # Normalize the phi_t(w) weights of each topic so that its top
      # @nbterms words sum to 1. The top-word indices are computed once
      # and reused for both passes.
      top_indices = tmp_top_word_indices(@vocab, @beta, @nbterms)

      total_prob = {}
      top_indices.each_pair do |topic, indices|
        total_prob[topic] = indices.inject(0.0) { |res, i| res + Math.exp(@beta[topic][i].to_f) }
      end

      top_indices.each_pair do |topic, indices|
        c = Concept.new
        indices.each do |i|
          begin
            e = ConceptualElement.new @vocab[i], (Math.exp(@beta[topic][i].to_f) / total_prob[topic])
            c << e
          rescue ArgumentError
            next
          end
        end

        c.compute_coherence @doc_scores, @theta, topic
#      c.compute_coherence @doc_scores,gamma_m,topic # takes time since it has to compute several probabilities

        @concepts << c
        @total_coherence += c.coherence
      end
    end
  
    def to_s
      @concepts.collect do |c|
        "#{c.coherence / @total_coherence} => [#{c.elements.collect do |e|
          "#{e.prob} #{e.word}"
        end.join(', ')}]"
      end.join("\n")
    end
  
    def to_indriq
      "#weight( #{@concepts.collect do |c|
        "#{c.coherence/@total_coherence} #weight ( #{c.elements.collect do |e|
          "#{e.prob} #{e.word}"
        end.join(' ')
        } ) "
      end.join " "} )"
    end
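
    # Illustrative to_indriq output (hypothetical weights and words), in the
    # Indri structured query language:
    #
    #   #weight( 0.6 #weight ( 0.12 storm 0.08 wind )  0.4 #weight ( 0.10 rain ) )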
  
    def <<(concept)
      raise ArgumentError, 'Argument must be a Concept.' unless concept.is_a? Concept
      @concepts << concept
    end
  
    def avg_model_coherence
      if @documents.empty?
        @avg_coherence = 0.0
      else
        # Mean UCI coherence over all concepts.
        @avg_coherence = @concepts.inject(0.0) { |res, c| res + c.uci_coherence } / @concepts.count
      end
      @avg_coherence
    end
  
    def entropy_model_coherence
      if @documents.empty?
        @entropy_coherence = 0.0
      else
        # Entropy-style aggregate of the per-concept UCI coherences
        # (sum of c * log(c)), with a tiny constant guarding against log(0).
        @entropy_coherence = @concepts.inject(0.0) do |res, c|
          ent = c.uci_coherence_entropy
          ent += 0.0000000000000000000000001 if ent.zero?
          res + ent * Math.log(ent)
        end
      end
      @entropy_coherence
    end
  
    private
    def topic_divergence corpus
      max_kl = 0.0
# Old trick to limit number of iterations
#    num_p = @nbdocs < 6 ? @nbdocs + 5 : @nbdocs

      # Fit models with k = 2..20 topics and keep the k that maximizes the
      # average symmetric KL divergence between topic-word distributions.
      1.upto(20).inject do |k,ntop|
#    1.upto(num_p).inject do |k,ntop|
        lda = Lda::Lda.new corpus
        lda.verbose = false
        lda.num_topics = ntop
        lda.em('random')
        beta_m = lda.beta   # cached to avoid repeated expensive computation
        vocab  = lda.vocab

        topics_i = Array.new(ntop) { |i| i }

        sum_kl = topics_i.combination(2).inject(0.0) do |kl,topics|
          ti = topics.first
          tj = topics.last
          begin
            # beta holds log probabilities, hence the Math.exp calls.
            kl + 0.upto(vocab.count-1).inject(0.0) do |res,w_i|
              p_i = Math.exp(beta_m[ti][w_i])
              p_j = Math.exp(beta_m[tj][w_i])
              res + p_i*Math.log(p_i/p_j) + p_j*Math.log(p_j/p_i)
            end
          rescue
            kl + 0.0
          end
        end

        # Average over the ntop*(ntop-1) ordered topic pairs.
        sum_kl /= ntop*(ntop-1)
        sum_kl = max_kl if sum_kl.nan? || sum_kl.infinite?

        # Keep the previous best k unless ntop improves the divergence.
        if sum_kl <= max_kl
          k
        else
          max_kl = sum_kl
          ntop
        end
      end
    end
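
    # The score computed above for a candidate number of topics k is
    #
    #   (1 / (k*(k-1))) * sum_{i != j} sum_w p_i(w) * log(p_i(w) / p_j(w))
    #
    # i.e. the average symmetric KL divergence between the topic-word
    # distributions; the k with the largest average divergence wins.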
  
    def tmp_top_word_indices(vocab, beta, words_per_topic = 10)
      raise 'No vocabulary loaded.' unless vocab

      # Find the highest-scoring words per topic.
      topics  = Hash.new
      indices = (0...vocab.size).to_a

      beta.each_with_index do |topic, topic_num|
        # Pair each score with its word index, sort ascending by score,
        # reverse, and keep the indices of the words_per_topic top words.
        topics[topic_num] = topic.zip(indices).sort { |i, j| i[0] <=> j[0] }.map { |i, j| j }.reverse[0...words_per_topic]
      end

      topics
    end
  
  end
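
  # A minimal end-to-end sketch, assuming :robust is a key defined in
  # Context::IndexPaths (the index name is hypothetical):
  #
  #   cm = ConceptModel.new 'hurricane damage', :robust, 10
  #   puts cm.to_indriq            # weighted Indri query built from the concepts
  #   puts cm.avg_model_coherence  # mean UCI coherence of the concepts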