Commit b3c02139759e76987368a3d947c49f8c6641ce21
1 parent
e0e33fca06
Exists in
master
Faster computing of successive calls to ngrams(). is_stopword? is now effective for multiword expressions. Fix for sequential_dependence_model (sdm).
Showing 3 changed files with 13 additions and 8 deletions (side-by-side diff).
lib/mirimiri/document.rb
| ... | ... | @@ -50,15 +50,18 @@ |
| 50 | 50 | window = [] |
| 51 | 51 | ngrams_array = [] |
| 52 | 52 | |
| 53 | - @words.each do |w| | |
| 54 | - window.push(w) | |
| 55 | - if window.size == n | |
| 56 | - ngrams_array.push window.join(" ") | |
| 57 | - window.delete_at(0) | |
| 53 | + if @ngrams[n].nil? | |
| 54 | + @words.each do |w| | |
| 55 | + window.push(w) | |
| 56 | + if window.size == n | |
| 57 | + ngrams_array.push window.join(" ") | |
| 58 | + window.delete_at(0) | |
| 59 | + end | |
| 58 | 60 | end |
| 61 | + @ngrams[n] = ngrams_array | |
| 59 | 62 | end |
| 60 | 63 | |
| 61 | - ngrams_array | |
| 64 | + @ngrams[n] | |
| 62 | 65 | end |
| 63 | 66 | |
| 64 | 67 | # Returns a Hash containing the words and their associated counts in the current Document. |
| ... | ... | @@ -122,6 +125,7 @@ |
| 122 | 125 | @doc_content = content |
| 123 | 126 | @words = format_words |
| 124 | 127 | @count_words = count_words |
| 128 | + @ngrams = {} | |
| 125 | 129 | end |
| 126 | 130 | |
| 127 | 131 | protected :format_words, :count_words |
lib/mirimiri/string.rb
| ... | ... | @@ -163,7 +163,7 @@ |
| 163 | 163 | |
| 164 | 164 | # Returns +true+ if +self+ belongs to Mirimiri::Stoplist, +false+ otherwise. |
| 165 | 165 | def is_stopword? |
| 166 | - Stoplist.include?(self.downcase) | |
| 166 | + self.split.all? { |e| Stoplist.include?(e.downcase) } | |
| 167 | 167 | end |
| 168 | 168 | |
| 169 | 169 | def sequential_dependence_model t=0.85,o=0.10,u=0.05,field=nil |
| ... | ... | @@ -187,7 +187,7 @@ |
| 187 | 187 | end |
| 188 | 188 | end |
| 189 | 189 | |
| 190 | - "#weight ( #{t} #combine( #{self} ) #{o} #combine ( #{ematch.join(" ")} ) #{u} #combine ( #{pmatch.join(" ")} ) )" | |
| 190 | + "#weight ( #{t} #combine( #{d.words.join(" ")} ) #{o} #combine ( #{ematch.join(" ")} ) #{u} #combine ( #{pmatch.join(" ")} ) )" | |
| 191 | 191 | end |
| 192 | 192 | |
| 193 | 193 | # Do not use. |
main.rb