concept_model.rb

#!/usr/bin/env ruby
require 'lda-ruby'
require 'peach'

# Builds a topic ("concept") model over the feedback documents returned for
# a query, using LDA from the lda-ruby gem.
class ConceptModel
  attr_reader :concepts, :documents, :source, :nbdocs, :nbterms, :query,
              :total_coherence, :doc_scores, :doc_names, :theta,
              :entropy_coherence, :avg_coherence
  # Parses the stringified topic list produced by an HDP run into Concept
  # objects; each entry looks like "topic 0: 0.12*word + 0.08*other".
  def self.parse_hdp(str)
    concepts = []
    eval(str).each do |hdp_top|
      c = Concept.new
      hdp_top.gsub(/topic \d+: /, '').split(' + ').each do |weighted_word|
        prob, word = weighted_word.split('*')
        begin
          e = ConceptualElement.new word, prob.to_f
          c << e
        rescue ArgumentError
          next # skip malformed weight*word pairs
        end
      end
      concepts << c
    end
    concepts
  end
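
  # A minimal sketch of the expected input, assuming the HDP tool emits a
  # Ruby-parseable array of topic strings (the weights and words below are
  # made up for illustration):
  #
  #   str = '["topic 0: 0.12*model + 0.08*topic", "topic 1: 0.10*query"]'
  #   concepts = ConceptModel.parse_hdp(str)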

  def initialize(query, source, nb_docs, nb_terms = 10, k = false)
    raise ArgumentError, 'Argument 1 must be a String.' unless query.is_a? String
    raise ArgumentError, 'Argument 2 must be a valid Index key.' unless Context::IndexPaths.has_key?(source.to_sym)
    @source = source.to_sym
    @nbdocs = nb_docs
    @nbterms = nb_terms
    @query = query
    @concepts = []
    @total_coherence = 0.0
    # Fetch the feedback documents for the query and load them into an LDA
    # corpus.
    corpus = Lda::Corpus.new
    @documents, @doc_scores, @doc_names = Context.feedback_docs Context::IndexPaths[@source], @query, @nbdocs
    @documents.each do |d|
      doc = Lda::TextDocument.new corpus, d
      corpus.add_document doc
    end
    # Either use the requested number of topics or pick it automatically by
    # maximizing the average symmetric KL divergence between topics.
    num_topics = k == false ? topic_divergence(corpus) : k
    lda = Lda::Lda.new corpus
    lda.verbose = false
    lda.num_topics = num_topics
    lda.em('random')
    @beta = lda.beta # cached to avoid repeated expensive computation
    @vocab = lda.vocab
    @theta = lda.compute_topic_document_probability
    # Normalize the phi_t(w) weights for each topic so the top nbterms words
    # form a proper distribution.
    top_indices = tmp_top_word_indices(@nbterms, @vocab, @beta)
    total_prob = {}
    top_indices.each_pair do |topic, indices|
      total_prob[topic] = indices.inject(0.0) { |res, i| res + Math.exp(@beta[topic][i].to_f) }
    end
    top_indices.each_pair do |topic, indices|
      c = Concept.new
      indices.each do |i|
        begin
          e = ConceptualElement.new @vocab[i], (Math.exp(@beta[topic][i].to_f) / total_prob[topic])
          c << e
        rescue ArgumentError
          next
        end
      end
      c.compute_coherence @doc_scores, @theta, topic
      # c.compute_coherence @doc_scores,gamma_m,topic # takes time since it has to compute several probabilities
      @concepts << c
      @total_coherence += c.coherence
    end
  end

  def to_s
    @concepts.collect do |c|
      elems = c.elements.collect { |e| "#{e.prob} #{e.word}" }.join(', ')
      "#{c.coherence / @total_coherence} => [#{elems}]"
    end.join "\n"
  end

  # Serializes the model as an Indri #weight query: concepts weighted by
  # their normalized coherence, words by their normalized probabilities.
  def to_indriq
    inner = @concepts.collect do |c|
      words = c.elements.collect { |e| "#{e.prob} #{e.word}" }.join(' ')
      "#{c.coherence / @total_coherence} #weight ( #{words} ) "
    end.join ' '
    "#weight( #{inner} )"
  end
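
  # For a two-concept model, to_indriq yields a query shaped like the
  # following (the weights and words here are illustrative, not real output):
  #
  #   #weight( 0.6 #weight ( 0.3 model 0.2 topic )  0.4 #weight ( 0.5 query )  )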

  def <<(concept)
    raise ArgumentError, 'Argument must be a Concept.' unless concept.is_a? Concept
    @concepts << concept
  end

  # Mean UCI coherence across all concepts; 0.0 when there are no feedback
  # documents.
  def avg_model_coherence
    if @documents.empty?
      @avg_coherence = 0.0
    else
      @avg_coherence = @concepts.inject(0.0) { |res, c| res + c.uci_coherence } / @concepts.count # if @avg_coherence.nil?
    end
    @avg_coherence
  end

  # Entropy-style aggregate of the per-concept UCI coherences (sum of
  # ent * log(ent)); a tiny epsilon guards against log(0).
  def entropy_model_coherence
    if @documents.empty?
      @entropy_coherence = 0.0
    else
      @entropy_coherence = @concepts.inject(0.0) do |res, c|
        ent = c.uci_coherence_entropy
        ent += 1e-25 if ent.zero?
        res + ent * Math.log(ent)
      end # if @entropy_coherence.nil?
    end
    @entropy_coherence
  end

  private

  # Picks the number of topics by fitting LDA for k = 2..20 and keeping the
  # k that maximizes the average symmetric KL divergence between all topic
  # pairs, where p_t(w) = exp(beta[t][w]):
  #   KL_sym(ti, tj) = sum_w p_ti(w)*log(p_ti(w)/p_tj(w)) + p_tj(w)*log(p_tj(w)/p_ti(w))
  def topic_divergence corpus
    max_kl = 0.0
    # Old trick to limit the number of iterations
    # num_p = @nbdocs < 6 ? @nbdocs + 5 : @nbdocs
    1.upto(20).inject do |k, ntop|
      # 1.upto(num_p).inject do |k,ntop|
      lda = Lda::Lda.new corpus
      lda.verbose = false
      lda.num_topics = ntop
      lda.em('random')
      beta_m = lda.beta # cached to avoid repeated expensive computation
      vocab = lda.vocab
      topics_i = Array.new(ntop) { |i| i }
      sum_kl = topics_i.combination(2).inject(0.0) do |kl, topics|
        ti = topics.first
        tj = topics.last
        begin
          kl + 0.upto(vocab.count - 1).inject(0.0) do |res, w_i|
            p_i = Math.exp(beta_m[ti][w_i])
            p_j = Math.exp(beta_m[tj][w_i])
            res + p_i * Math.log(p_i / p_j) + p_j * Math.log(p_j / p_i)
          end
        rescue
          kl + 0.0 # skip numerically degenerate topic pairs
        end
      end
      sum_kl /= ntop * (ntop - 1)
      sum_kl = max_kl if sum_kl.nan? || sum_kl.infinite?
      # inject threads the best k seen so far through as the memo
      if sum_kl > max_kl
        max_kl = sum_kl
        ntop
      else
        k
      end
    end
  end

  # Returns, for each topic, the indices of the words_per_topic
  # highest-scoring words in the vocabulary.
  def tmp_top_word_indices(words_per_topic = 10, vocab, beta)
    raise 'No vocabulary loaded.' unless vocab
    topics = Hash.new
    indices = (0...vocab.size).to_a
    beta.each_with_index do |topic, topic_num|
      topics[topic_num] = topic.zip(indices)
                               .sort { |i, j| i[0] <=> j[0] }
                               .map { |_score, idx| idx }
                               .reverse[0...words_per_topic]
    end
    topics
  end
end
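
# A minimal usage sketch, assuming a Context::IndexPaths key such as :ap88
# exists (the key, query, and document/term counts below are hypothetical):
#
#   model = ConceptModel.new('airbus subsidies', :ap88, 10, 5)
#   puts model            # human-readable concepts with normalized weights
#   puts model.to_indriq  # the same model as an Indri #weight query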