Blame view

lib/context/concept.rb 3.8 KB
65040e3e6   Romain Deveaud   changes made for ...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
  #!/usr/bin/env ruby
  
  # A collection of ConceptualElement objects together with a coherence score.
  #
  # The only messages this class sends to its elements are +word+, +prob+ and
  # +p_in_coll+. Collection statistics are obtained through Context.prob_w and
  # Context.df, using an index path taken from Context::IndexPaths by default.
  class Concept
    attr_reader :elements, :coherence

    def initialize
      @elements = []
      @coherence = 0
    end

    # Appends +elem+ to the concept.
    #
    # Raises ArgumentError unless +elem+ is a ConceptualElement.
    # Returns the underlying elements array. NOTE(review): because of that,
    # a chained call (+concept << a << b+) sends the second +<<+ straight to
    # the array and skips the type check.
    def <<(elem)
      raise ArgumentError, 'Argument must be a ConceptualElement.' unless elem.is_a? ConceptualElement

      @elements << elem
    end

    # Stores and returns, as @coherence, the sum over every index i of +theta+
    # of exp(theta[i][k]) * exp(scores[i]).
    #
    # scores - indexable collection; each entry is converted with +to_f+.
    # theta  - countable, indexable collection of rows, each indexable by +k+.
    # k      - index applied to every row of +theta+.
    def compute_coherence(scores, theta, k)
      @coherence = (0...theta.count).inject(0.0) do |sum, i|
        sum + Math.exp(theta[i][k]) * Math.exp(scores[i].to_f)
      end
    end

    # Scores +label+ against the concept: for every element, adds
    #   prob * log( P("word #uw10(label)") / (p_in_coll * P(label)) )
    # where P is Context.prob_w on +index_path+.
    #
    # Returns 0.0 for an empty concept (no index lookup is made in that case).
    def score_label(label, index_path = Context::IndexPaths[:wiki_en2012])
      return 0.0 if @elements.empty?

      # P(label) does not depend on the element: look it up once.
      label_prob = Context.prob_w(index_path, label)

      @elements.inject(0.0) do |res, e|
        joint = Context.prob_w(index_path, "#{e.word} #uw10(#{label})")
        res + e.prob * Math.log(joint / (e.p_in_coll * label_prob))
      end
    end

    # Similarity with another concept +s+ based on shared words:
    #   (|shared| / |own words|) * sum over shared words of log(Context.df(word))
    #
    # +s+ must respond to +words+. Returns 0.0 when this concept is empty
    # (the overlap ratio would otherwise be 0/0.0, i.e. NaN).
    def concept_words_similarity(s, index_path = Context::IndexPaths[:wiki_en])
      own_words = words
      shared = own_words & s.words
      return 0.0 if own_words.empty?

      overlap = shared.count / own_words.count.to_f
      overlap * shared.inject(0.0) { |sum, w| sum + Math.log(Context.df(index_path, w)) }
    end

    # Like #concept_words_similarity, but each element of this concept whose
    # word also exists in +s+ contributes
    #   s_element.prob * element.prob * log(Context.df(word))
    # and the total is scaled by the same overlap ratio.
    #
    # +s+ must respond to +words+ and +get_element_from_word+.
    # Returns 0.0 when this concept is empty (avoids a NaN ratio).
    def weighted_concept_similarity(s, index_path = Context::IndexPaths[:wiki_en])
      own_words = words
      shared = own_words & s.words
      return 0.0 if own_words.empty?

      overlap = shared.count / own_words.count.to_f

      weighted = @elements.inject(0.0) do |sum, e|
        # A nil lookup means the word is absent from +s+; no need to rebuild
        # s.words on every iteration just to test membership.
        match = s.get_element_from_word(e.word)
        match ? sum + match.prob * e.prob * Math.log(Context.df(index_path, e.word)) : sum
      end

      overlap * weighted
    end

    # Returns the first element whose word equals +w+, or nil if there is none.
    def get_element_from_word(w)
      @elements.find { |e| e.word == w }
    end

    # Returns the words of all elements, in insertion order.
    def words
      @elements.map(&:word)
    end

    # Returns a Hash mapping each element's word to its prob. When a word
    # occurs more than once, the last element wins.
    def word_probs
      @elements.each_with_object({}) { |e, probs| probs[e.word] = e.prob }
    end

    # From papers :
    #  NAACL'10: `Automatic Evaluation of Topic Coherence`
    #  EMNLP'12: `Exploring Topic Coherence over many models and many topics`
    #
    # For every unordered pair of elements (a, b), adds
    #   log( (P("a b", 20) + epsilon) / (a.p_in_coll * b.p_in_coll) )
    # with P = Context.prob_w, then normalises (see #normalized_pair_sum).
    #
    # Returns 0.0 when the concept has fewer than two elements.
    def uci_coherence(epsilon = 1, index_path = Context::IndexPaths[:wiki_en])
      normalized_pair_sum do |a, b|
        joint = Context.prob_w(index_path, "#{a.word} #{b.word}", 20)
        Math.log((joint + epsilon) / (a.p_in_coll * b.p_in_coll))
      end
    end

    # Same pair traversal as #uci_coherence but adds the raw ratio
    #   P("a b", 20) / (a.p_in_coll * b.p_in_coll)
    # (no logarithm, no epsilon).
    #
    # Returns 0.0 when the concept has fewer than two elements.
    def uci_coherence_entropy(index_path = Context::IndexPaths[:wiki_en])
      normalized_pair_sum do |a, b|
        Context.prob_w(index_path, "#{a.word} #{b.word}", 20) / (a.p_in_coll * b.p_in_coll)
      end
    end

    protected

    # Recomputes @coherence from the index: every pair (a, b) contributes
    #   P("a b") * log( P("a b") / (a.p_in_coll * b.p_in_coll) )
    # and a pair whose joint probability is zero contributes 0.0 (the same
    # convention #update_feedback_coherence uses) instead of 0 * log(0) = NaN.
    #
    # NOTE(review): +p_in_coll+ is called WITH +index_path+ here but without
    # arguments everywhere else in this class - confirm the element API.
    def update_coherence(index_path = Context::IndexPaths[:wiki_en])
      @coherence = normalized_pair_sum do |a, b|
        # Single lookup; the same query was previously issued twice per pair.
        joint = Context.prob_w(index_path, "#{a.word} #{b.word}")
        if joint.zero?
          0.0
        else
          joint * Math.log(joint / (a.p_in_coll(index_path) * b.p_in_coll(index_path)))
        end
      end
    end

    # Recomputes @coherence from feedback +documents+ (joined with a space and
    # wrapped in a Mirimiri::Document). The text is cut with +ngrams(10)+ and
    # each pair (a, b) contributes
    #   p * a.prob * b.prob * log( p / (tf(a.word) * tf(b.word)) )
    # where p is the fraction of windows containing both words; pairs that
    # never co-occur contribute 0.0. An empty window list also yields p = 0.0.
    def update_feedback_coherence(documents)
      corpus = Mirimiri::Document.new(documents.join(" "))

      windows = corpus.ngrams(10).map(&:split)
      window_total = windows.count

      @coherence = normalized_pair_sum do |a, b|
        hits = windows.count { |win| win.include?(a.word) && win.include?(b.word) }
        big_prob = window_total.zero? ? 0.0 : hits.to_f / window_total

        if big_prob.zero?
          0.0
        else
          big_prob * a.prob * b.prob * Math.log(big_prob / (corpus.tf(a.word) * corpus.tf(b.word)))
        end
      end
    end

    private

    # Yields every unordered pair of elements, sums the block results and
    # divides the total by n * (n - 1), n being the number of elements.
    #
    # Returns 0.0 when n < 2: there is no pair to score and the division
    # would be 0.0 / 0, i.e. NaN.
    #
    # NOTE(review): combination(2) produces n * (n - 1) / 2 pairs, so the
    # result is half of the mean pairwise value. Kept as is so that existing
    # scores do not change - confirm whether this is intended.
    def normalized_pair_sum
      n = @elements.count
      return 0.0 if n < 2

      total = @elements.combination(2).inject(0.0) { |sum, (a, b)| sum + yield(a, b) }
      total / (n * (n - 1))
    end
  end