Commit b3c02139759e76987368a3d947c49f8c6641ce21
1 parent
e0e33fca06
Exists in
master
Faster computation of successive calls to ngrams(). is_stopword? is now effective for multiword expressions. sdm fix.
Showing 3 changed files with 13 additions and 8 deletions Side-by-side Diff
lib/mirimiri/document.rb
... | ... | @@ -50,15 +50,18 @@ |
50 | 50 | window = [] |
51 | 51 | ngrams_array = [] |
52 | 52 | |
53 | - @words.each do |w| | |
54 | - window.push(w) | |
55 | - if window.size == n | |
56 | - ngrams_array.push window.join(" ") | |
57 | - window.delete_at(0) | |
53 | + if @ngrams[n].nil? | |
54 | + @words.each do |w| | |
55 | + window.push(w) | |
56 | + if window.size == n | |
57 | + ngrams_array.push window.join(" ") | |
58 | + window.delete_at(0) | |
59 | + end | |
58 | 60 | end |
61 | + @ngrams[n] = ngrams_array | |
59 | 62 | end |
60 | 63 | |
61 | - ngrams_array | |
64 | + @ngrams[n] | |
62 | 65 | end |
63 | 66 | |
64 | 67 | # Returns a Hash containing the words and their associated counts in the current Document. |
... | ... | @@ -122,6 +125,7 @@ |
122 | 125 | @doc_content = content |
123 | 126 | @words = format_words |
124 | 127 | @count_words = count_words |
128 | + @ngrams = {} | |
125 | 129 | end |
126 | 130 | |
127 | 131 | protected :format_words, :count_words |
lib/mirimiri/string.rb
... | ... | @@ -163,7 +163,7 @@ |
163 | 163 | |
164 | 164 | # Returns +true+ if +self+ belongs to Mirimiri::Stoplist, +false+ otherwise. |
165 | 165 | def is_stopword? |
166 | - Stoplist.include?(self.downcase) | |
166 | + self.split.all? { |e| Stoplist.include?(e.downcase) } | |
167 | 167 | end |
168 | 168 | |
169 | 169 | def sequential_dependence_model t=0.85,o=0.10,u=0.05,field=nil |
... | ... | @@ -187,7 +187,7 @@ |
187 | 187 | end |
188 | 188 | end |
189 | 189 | |
190 | - "#weight ( #{t} #combine( #{self} ) #{o} #combine ( #{ematch.join(" ")} ) #{u} #combine ( #{pmatch.join(" ")} ) )" | |
190 | + "#weight ( #{t} #combine( #{d.words.join(" ")} ) #{o} #combine ( #{ematch.join(" ")} ) #{u} #combine ( #{pmatch.join(" ")} ) )" | |
191 | 191 | end |
192 | 192 | |
193 | 193 | # Do not use. |
main.rb