Blame view
lib/context/concept.rb
3.8 KB
65040e3e6 changes made for ... |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 |
#!/usr/bin/env ruby

# A collection of ConceptualElement objects plus a coherence score.
#
# The only messages this class sends to its elements are +word+, +prob+ and
# +p_in_coll+. Collection statistics come from +Context.prob_w+ and
# +Context.df+, which are defined elsewhere in the project.
class Concept
  attr_reader :elements, :coherence

  # Starts with no elements and a coherence of 0.
  def initialize
    @elements = []
    @coherence = 0
  end

  # Appends an element to the concept.
  #
  # @param elem [ConceptualElement]
  # @return [Array] the internal elements array (the result of Array#<<).
  #   Because an Array is returned, a chained `concept << a << b` appends
  #   +b+ straight to that array without the type check below.
  # @raise [ArgumentError] if +elem+ is not a ConceptualElement
  def <<(elem)
    raise ArgumentError, 'Argument must be a ConceptualElement.' unless elem.is_a? ConceptualElement

    @elements << elem
  end

  # Stores and returns sum_i exp(theta[i][k]) * exp(scores[i]).
  #
  # @param scores [#[]] indexed by the same positions as +theta+; entries are
  #   converted with +to_f+ (so a missing/nil entry contributes exp(0.0))
  # @param theta [#count, #[]] rows indexed by integer, each indexed by +k+
  # @param k index selecting one entry in every row of +theta+
  # @return [Float] the new value of +coherence+
  def compute_coherence(scores, theta, k)
    @coherence = (0...theta.count).inject(0.0) do |sum, i|
      sum + Math.exp(theta[i][k]) * Math.exp(scores[i].to_f)
    end
  end

  # Scores a candidate label against the concept:
  #   sum_e e.prob * log( P("e.word #uw10(label)") / (e.p_in_coll * P(label)) )
  #
  # @param label [String] candidate label
  # @param index_path index handed to +Context.prob_w+
  # @return [Float] 0.0 for a concept without elements
  def score_label(label, index_path = Context::IndexPaths[:wiki_en2012])
    label_prob = nil # looked up lazily so an empty concept issues no query
    @elements.inject(0.0) do |res, e|
      label_prob ||= Context.prob_w(index_path, label)
      joint_prob = Context.prob_w(index_path, "#{e.word} #uw10(#{label})")
      res + e.prob * Math.log(joint_prob / (e.p_in_coll * label_prob))
    end
  end

  # Similarity with another concept based on the words they share:
  #   (|shared| / |own words|) * sum_{w in shared} log(Context.df(w))
  #
  # @param s [#words] the other concept
  # @param index_path index handed to +Context.df+
  # @return [Float] 0.0 when this concept has no elements (previously NaN)
  def concept_words_similarity(s, index_path = Context::IndexPaths[:wiki_en])
    own_words = words
    return 0.0 if own_words.empty?

    shared = own_words & s.words
    df_mass = shared.inject(0.0) { |sum, w| sum + Math.log(Context.df(index_path, w)) }
    (shared.count / own_words.count.to_f) * df_mass
  end

  # Like #concept_words_similarity, but every shared word is weighted by the
  # product of its probabilities in both concepts.
  #
  # @param s [#words, #get_element_from_word] the other concept
  # @param index_path index handed to +Context.df+
  # @return [Float] 0.0 when this concept has no elements (previously NaN)
  def weighted_concept_similarity(s, index_path = Context::IndexPaths[:wiki_en])
    own_words = words
    return 0.0 if own_words.empty?

    overlap_ratio = (own_words & s.words).count / own_words.count.to_f
    weighted_mass = @elements.inject(0.0) do |sum, e|
      counterpart = s.get_element_from_word(e.word)
      # Words absent from +s+ contribute nothing.
      next sum unless counterpart

      sum + counterpart.prob * e.prob * Math.log(Context.df(index_path, e.word))
    end
    overlap_ratio * weighted_mass
  end

  # @param w word to look up
  # @return the first element whose +word+ equals +w+, or nil
  def get_element_from_word(w)
    @elements.find { |e| e.word == w }
  end

  # @return [Array] the +word+ of every element, in insertion order
  def words
    @elements.map(&:word)
  end

  # @return [Hash] word => prob; for a repeated word the last element wins
  def word_probs
    @elements.each_with_object({}) { |e, probs| probs[e.word] = e.prob }
  end

  # From papers :
  #   NAACL'10: `Automatic Evaluation of Topic Coherence`
  #   EMNLP'12: `Exploring Topic Coherence over many models and many topics`
  #
  # Sums log((P(w1 w2) + epsilon) / (p_in_coll(w1) * p_in_coll(w2))) over
  # every unordered pair of elements, then normalises (see
  # #normalized_pair_sum).
  #
  # @param epsilon [Numeric] smoothing term added to the joint probability
  # @param index_path index handed to +Context.prob_w+
  # @return [Float] 0.0 for fewer than two elements (previously NaN)
  def uci_coherence(epsilon = 1, index_path = Context::IndexPaths[:wiki_en])
    normalized_pair_sum do |first, second|
      joint_prob = Context.prob_w(index_path, "#{first.word} #{second.word}", 20)
      Math.log((joint_prob + epsilon) / (first.p_in_coll * second.p_in_coll))
    end
  end

  # Same as #uci_coherence but without the logarithm and without smoothing:
  # each pair contributes P(w1 w2) / (p_in_coll(w1) * p_in_coll(w2)).
  #
  # @param index_path index handed to +Context.prob_w+
  # @return [Float] 0.0 for fewer than two elements (previously NaN)
  def uci_coherence_entropy(index_path = Context::IndexPaths[:wiki_en])
    normalized_pair_sum do |first, second|
      Context.prob_w(index_path, "#{first.word} #{second.word}", 20) / (first.p_in_coll * second.p_in_coll)
    end
  end

  protected

  # Recomputes +coherence+ from collection statistics: each pair contributes
  # P(w1 w2) * log(P(w1 w2) / (p_in_coll(w1) * p_in_coll(w2))).
  #
  # NOTE(review): unlike the public methods, +p_in_coll+ is called WITH
  # +index_path+ here; confirm ConceptualElement#p_in_coll accepts it.
  #
  # @param index_path index handed to +Context.prob_w+ and +p_in_coll+
  # @return [Float] the new value of +coherence+
  def update_coherence(index_path = Context::IndexPaths[:wiki_en])
    @coherence = normalized_pair_sum do |first, second|
      # The original queried the index twice for this same string.
      joint_prob = Context.prob_w(index_path, "#{first.word} #{second.word}")
      joint_prob * Math.log(joint_prob / (first.p_in_coll(index_path) * second.p_in_coll(index_path)))
    end
  end

  # Recomputes +coherence+ from a set of feedback documents. The joined
  # documents are cut into windows via +ngrams(10)+; a pair's joint
  # probability is the share of windows containing both words.
  #
  # @param documents [#join] document texts
  # @return [Float] the new value of +coherence+; 0.0 for fewer than two
  #   elements, and pairs that never co-occur (including the case where there
  #   are no windows at all) contribute 0.0 instead of NaN
  def update_feedback_coherence(documents)
    return @coherence = 0.0 if @elements.count < 2

    corpus = Mirimiri::Document.new(documents.join(' '))
    windows = corpus.ngrams(10).collect { |w| w.split }

    @coherence = normalized_pair_sum do |first, second|
      hits = windows.count { |c| c.include?(first.word) && c.include?(second.word) }
      # Zero hits covers both "never co-occur" and "no windows": avoids 0.0/0.
      next 0.0 if hits.zero?

      big_prob = hits.to_f / windows.count
      big_prob * first.prob * second.prob *
        Math.log(big_prob / (corpus.tf(first.word) * corpus.tf(second.word)))
    end
  end

  private

  # Yields every unordered pair of elements, sums the block's results and
  # divides the total by n * (n - 1).
  #
  # NOTE(review): n * (n - 1) is twice the number of pairs actually summed,
  # so the result is half the mean pair score. Kept as-is to preserve
  # existing values; confirm whether that is intended.
  #
  # @yieldparam first  first element of the pair
  # @yieldparam second second element of the pair
  # @yieldreturn [Numeric] the pair's contribution
  # @return [Float] 0.0 when there are fewer than two elements
  def normalized_pair_sum
    size = @elements.count
    return 0.0 if size < 2

    total = @elements.combination(2).inject(0.0) do |acc, (first, second)|
      acc + yield(first, second)
    end
    total / (size * (size - 1))
  end
end