Commit b3c02139759e76987368a3d947c49f8c6641ce21

Authored by Romain Deveaud
1 parent e0e33fca06
Exists in master

Faster computing of successive calls to ngrams(). is_stopword? is now effective for multiword expressions. sdm fix.

Showing 3 changed files with 13 additions and 8 deletions Side-by-side Diff

lib/mirimiri/document.rb
... ... @@ -50,15 +50,18 @@
50 50 window = []
51 51 ngrams_array = []
52 52  
53   - @words.each do |w|
54   - window.push(w)
55   - if window.size == n
56   - ngrams_array.push window.join(" ")
57   - window.delete_at(0)
  53 + if @ngrams[n].nil?
  54 + @words.each do |w|
  55 + window.push(w)
  56 + if window.size == n
  57 + ngrams_array.push window.join(" ")
  58 + window.delete_at(0)
  59 + end
58 60 end
  61 + @ngrams[n] = ngrams_array
59 62 end
60 63  
61   - ngrams_array
  64 + @ngrams[n]
62 65 end
63 66  
64 67 # Returns a Hash containing the words and their associated counts in the current Document.
... ... @@ -122,6 +125,7 @@
122 125 @doc_content = content
123 126 @words = format_words
124 127 @count_words = count_words
  128 + @ngrams = {}
125 129 end
126 130  
127 131 protected :format_words, :count_words
lib/mirimiri/string.rb
... ... @@ -163,7 +163,7 @@
163 163  
164 164 # Returns +true+ if +self+ belongs to Mirimiri::Stoplist, +false+ otherwise.
165 165 def is_stopword?
166   - Stoplist.include?(self.downcase)
  166 + self.split.all? { |e| Stoplist.include?(e.downcase) }
167 167 end
168 168  
169 169 def sequential_dependence_model t=0.85,o=0.10,u=0.05,field=nil
... ... @@ -187,7 +187,7 @@
187 187 end
188 188 end
189 189  
190   - "#weight ( #{t} #combine( #{self} ) #{o} #combine ( #{ematch.join(" ")} ) #{u} #combine ( #{pmatch.join(" ")} ) )"
  190 + "#weight ( #{t} #combine( #{d.words.join(" ")} ) #{o} #combine ( #{ematch.join(" ")} ) #{u} #combine ( #{pmatch.join(" ")} ) )"
191 191 end
192 192  
193 193 # Do not use.
1 1 $LOAD_PATH.unshift File.expand_path(File.join(File.dirname(__FILE__), "lib"))
2 2  
3 3 require 'mirimiri'
  4 +require "benchmark"
4 5  
5 6 w = Mirimiri::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan")
6 7 p w.entropy("dillinger escape plan")