concept_model.rb

#!/usr/bin/env ruby
require 'lda-ruby'
require 'peach'

# Builds a topic ("concept") model over the feedback documents returned for
# a query, using LDA from the lda-ruby gem.
class ConceptModel
  attr_reader :concepts, :documents, :source, :nbdocs, :nbterms, :query,
              :total_coherence, :doc_scores, :doc_names, :theta,
              :entropy_coherence, :avg_coherence
  # Parses the stringified topic list produced by an HDP run into Concept
  # objects; each entry looks like "topic 0: 0.12*word + 0.08*other".
  def self.parse_hdp(str)
    concepts = []
    eval(str).each do |hdp_top|
      c = Concept.new
      hdp_top.gsub(/topic \d+: /, '').split(' + ').each do |weighted_word|
        prob, word = weighted_word.split('*')
        begin
          e = ConceptualElement.new word, prob.to_f
          c << e
        rescue ArgumentError
          next # skip malformed weight*word pairs
        end
      end
      concepts << c
    end
    concepts
  end
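
  # A minimal sketch of the expected input, assuming the HDP tool emits a
  # Ruby-parseable array of topic strings (the weights and words below are
  # made up for illustration):
  #
  #   str = '["topic 0: 0.12*model + 0.08*topic", "topic 1: 0.10*query"]'
  #   concepts = ConceptModel.parse_hdp(str)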

  def initialize(query, source, nb_docs, nb_terms = 10, k = false)
    raise ArgumentError, 'Argument 1 must be a String.' unless query.is_a? String
    raise ArgumentError, 'Argument 2 must be a valid Index key.' unless Context::IndexPaths.has_key?(source.to_sym)
    @source = source.to_sym
    @nbdocs = nb_docs
    @nbterms = nb_terms
    @query = query
    @concepts = []
    @total_coherence = 0.0
    # Fetch the feedback documents for the query and load them into an LDA
    # corpus.
    corpus = Lda::Corpus.new
    @documents, @doc_scores, @doc_names = Context.feedback_docs Context::IndexPaths[@source], @query, @nbdocs
    @documents.each do |d|
      doc = Lda::TextDocument.new corpus, d
      corpus.add_document doc
    end
    # Either use the requested number of topics or pick it automatically by
    # maximizing the average symmetric KL divergence between topics.
    num_topics = k == false ? topic_divergence(corpus) : k
    lda = Lda::Lda.new corpus
    lda.verbose = false
    lda.num_topics = num_topics
    lda.em('random')
    @beta = lda.beta # cached to avoid repeated expensive computation
    @vocab = lda.vocab
    @theta = lda.compute_topic_document_probability
    # Normalize the phi_t(w) weights for each topic so the top nbterms words
    # form a proper distribution.
    top_indices = tmp_top_word_indices(@nbterms, @vocab, @beta)
    total_prob = {}
    top_indices.each_pair do |topic, indices|
      total_prob[topic] = indices.inject(0.0) { |res, i| res + Math.exp(@beta[topic][i].to_f) }
    end
    top_indices.each_pair do |topic, indices|
      c = Concept.new
      indices.each do |i|
        begin
          e = ConceptualElement.new @vocab[i], (Math.exp(@beta[topic][i].to_f) / total_prob[topic])
          c << e
        rescue ArgumentError
          next
        end
      end
      c.compute_coherence @doc_scores, @theta, topic
      # c.compute_coherence @doc_scores,gamma_m,topic # takes time since it has to compute several probabilities
      @concepts << c
      @total_coherence += c.coherence
    end
  end

  def to_s
    @concepts.collect do |c|
      elems = c.elements.collect { |e| "#{e.prob} #{e.word}" }.join(', ')
      "#{c.coherence / @total_coherence} => [#{elems}]"
    end.join "\n"
  end

  # Serializes the model as an Indri #weight query: concepts weighted by
  # their normalized coherence, words by their normalized probabilities.
  def to_indriq
    inner = @concepts.collect do |c|
      words = c.elements.collect { |e| "#{e.prob} #{e.word}" }.join(' ')
      "#{c.coherence / @total_coherence} #weight ( #{words} ) "
    end.join ' '
    "#weight( #{inner} )"
  end
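
  # For a two-concept model, to_indriq yields a query shaped like the
  # following (the weights and words here are illustrative, not real output):
  #
  #   #weight( 0.6 #weight ( 0.3 model 0.2 topic )  0.4 #weight ( 0.5 query )  )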

  def <<(concept)
    raise ArgumentError, 'Argument must be a Concept.' unless concept.is_a? Concept
    @concepts << concept
  end

  # Mean UCI coherence across all concepts; 0.0 when there are no feedback
  # documents.
  def avg_model_coherence
    if @documents.empty?
      @avg_coherence = 0.0
    else
      @avg_coherence = @concepts.inject(0.0) { |res, c| res + c.uci_coherence } / @concepts.count # if @avg_coherence.nil?
    end
    @avg_coherence
  end

  # Entropy-style aggregate of the per-concept UCI coherences (sum of
  # ent * log(ent)); a tiny epsilon guards against log(0).
  def entropy_model_coherence
    if @documents.empty?
      @entropy_coherence = 0.0
    else
      @entropy_coherence = @concepts.inject(0.0) do |res, c|
        ent = c.uci_coherence_entropy
        ent += 1e-25 if ent.zero?
        res + ent * Math.log(ent)
      end # if @entropy_coherence.nil?
    end
    @entropy_coherence
  end

  private

  # Picks the number of topics by fitting LDA for k = 2..20 and keeping the
  # k that maximizes the average symmetric KL divergence between all topic
  # pairs, where p_t(w) = exp(beta[t][w]):
  #   KL_sym(ti, tj) = sum_w p_ti(w)*log(p_ti(w)/p_tj(w)) + p_tj(w)*log(p_tj(w)/p_ti(w))
  def topic_divergence corpus
    max_kl = 0.0
    # Old trick to limit the number of iterations
    # num_p = @nbdocs < 6 ? @nbdocs + 5 : @nbdocs
    1.upto(20).inject do |k, ntop|
      # 1.upto(num_p).inject do |k,ntop|
      lda = Lda::Lda.new corpus
      lda.verbose = false
      lda.num_topics = ntop
      lda.em('random')
      beta_m = lda.beta # cached to avoid repeated expensive computation
      vocab = lda.vocab
      topics_i = Array.new(ntop) { |i| i }
      sum_kl = topics_i.combination(2).inject(0.0) do |kl, topics|
        ti = topics.first
        tj = topics.last
        begin
          kl + 0.upto(vocab.count - 1).inject(0.0) do |res, w_i|
            p_i = Math.exp(beta_m[ti][w_i])
            p_j = Math.exp(beta_m[tj][w_i])
            res + p_i * Math.log(p_i / p_j) + p_j * Math.log(p_j / p_i)
          end
        rescue
          kl + 0.0 # skip numerically degenerate topic pairs
        end
      end
      sum_kl /= ntop * (ntop - 1)
      sum_kl = max_kl if sum_kl.nan? || sum_kl.infinite?
      # inject threads the best k seen so far through as the memo
      if sum_kl > max_kl
        max_kl = sum_kl
        ntop
      else
        k
      end
    end
  end

  # Returns, for each topic, the indices of the words_per_topic
  # highest-scoring words in the vocabulary.
  def tmp_top_word_indices(words_per_topic = 10, vocab, beta)
    raise 'No vocabulary loaded.' unless vocab
    topics = Hash.new
    indices = (0...vocab.size).to_a
    beta.each_with_index do |topic, topic_num|
      topics[topic_num] = topic.zip(indices)
                               .sort { |i, j| i[0] <=> j[0] }
                               .map { |_score, idx| idx }
                               .reverse[0...words_per_topic]
    end
    topics
  end
end
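
# A minimal usage sketch, assuming a Context::IndexPaths key such as :ap88
# exists (the key, query, and document/term counts below are hypothetical):
#
#   model = ConceptModel.new('airbus subsidies', :ap88, 10, 5)
#   puts model            # human-readable concepts with normalized weights
#   puts model.to_indriq  # the same model as an Indri #weight query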