Commit 845768f8ac5a1593db356377fcc68208c12efa74
1 parent
175908fe2a
Exists in
master
creating a group of indri queries is possible. added an accent stripping function.
Showing 4 changed files with 116 additions and 17 deletions Inline Diff
examples/entropy.rb
1 | require 'rir' | 1 | require 'mirimiri' |
2 | 2 | ||
3 | # Concatenates all lines from one file, without \n | 3 | # Concatenates all lines from one file, without \n |
4 | readme = File.open('README.markdown').readlines.collect { |l| l.chomp }.join(" ") | 4 | readme = File.open('README.markdown').readlines.collect { |l| l.chomp }.join(" ") |
5 | 5 | ||
6 | # Creates the document with a string | 6 | # Creates the document with a string |
7 | doc = RIR::Document.new readme | 7 | doc = Mirimiri::Document.new readme |
8 | 8 | ||
9 | # Outputs all the unique words of the document with their entropy scores | 9 | # Outputs all the unique words of the document with their entropy scores |
10 | p doc.words.collect { |w| "#{w} => #{doc.entropy w}" } | 10 | p doc.words.collect { |w| "#{w} => #{doc.entropy w}" } |
11 | 11 |
lib/mirimiri/document.rb
1 | #!/usr/bin/env ruby | 1 | #!/usr/bin/env ruby |
2 | 2 | ||
3 | #-- | 3 | #-- |
4 | # This file is a part of the mirimiri library | 4 | # This file is a part of the mirimiri library |
5 | # | 5 | # |
6 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | 6 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> |
7 | # | 7 | # |
8 | # This program is free software: you can redistribute it and/or modify | 8 | # This program is free software: you can redistribute it and/or modify |
9 | # it under the terms of the GNU General Public License as published by | 9 | # it under the terms of the GNU General Public License as published by |
10 | # the Free Software Foundation, either version 3 of the License, or | 10 | # the Free Software Foundation, either version 3 of the License, or |
11 | # (at your option) any later version. | 11 | # (at your option) any later version. |
12 | # | 12 | # |
13 | # This program is distributed in the hope that it will be useful, | 13 | # This program is distributed in the hope that it will be useful, |
14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | 14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | # GNU General Public License for more details. | 16 | # GNU General Public License for more details. |
17 | # | 17 | # |
18 | # You should have received a copy of the GNU General Public License | 18 | # You should have received a copy of the GNU General Public License |
19 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | 19 | # along with this program. If not, see <http://www.gnu.org/licenses/>. |
20 | #++ | 20 | #++ |
21 | 21 | ||
22 | 22 | ||
23 | # General module | 23 | # General module |
24 | module Mirimiri | 24 | module Mirimiri |
25 | 25 | ||
26 | # A Document is a bag of words and is constructed from a string. | 26 | # A Document is a bag of words and is constructed from a string. |
27 | class Document | 27 | class Document |
28 | attr_reader :words, :doc_content | 28 | attr_reader :words, :doc_content |
29 | 29 | ||
30 | # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html | 30 | # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html |
31 | # and the \\W special escape). | 31 | # and the \\W special escape). |
32 | # | 32 | # |
33 | # Protected function, only meant to be called at initialization. | 33 | # Protected function, only meant to be called at initialization. |
34 | def format_words | 34 | def format_words |
35 | wo = [] | 35 | wo = [] |
36 | 36 | ||
37 | @doc_content.split.each do |w| | 37 | @doc_content.split.each do |w| |
38 | w.split(/\W/).each do |sw| | 38 | w.split(/\W/).each do |sw| |
39 | wo.push(sw.downcase) if sw =~ /[a-zA-Z]/ | 39 | wo.push(sw.downcase) if sw =~ /[a-zA-Z]/ |
40 | end | 40 | end |
41 | end | 41 | end |
42 | 42 | ||
43 | wo | 43 | wo |
44 | end | 44 | end |
45 | 45 | ||
46 | # Returns an Array containing the +n+-grams (words) from the current Document. | 46 | # Returns an Array containing the +n+-grams (words) from the current Document. |
47 | # | 47 | # |
48 | # ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...] | 48 | # ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...] |
49 | def ngrams(n) | 49 | def ngrams(n) |
50 | window = [] | 50 | window = [] |
51 | ngrams_array = [] | 51 | ngrams_array = [] |
52 | 52 | ||
53 | @words.each do |w| | 53 | @words.each do |w| |
54 | window.push(w) | 54 | window.push(w) |
55 | if window.size == n | 55 | if window.size == n |
56 | ngrams_array.push window.join(" ") | 56 | ngrams_array.push window.join(" ") |
57 | window.delete_at(0) | 57 | window.delete_at(0) |
58 | end | 58 | end |
59 | end | 59 | end |
60 | 60 | ||
61 | ngrams_array.uniq | 61 | ngrams_array.uniq |
62 | end | 62 | end |
63 | 63 | ||
64 | # Returns a Hash containing the words and their associated counts in the current Document. | 64 | # Returns a Hash containing the words and their associated counts in the current Document. |
65 | # | 65 | # |
66 | # count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... } | 66 | # count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... } |
67 | def count_words | 67 | def count_words |
68 | counts = Hash.new { |h,k| h[k] = 0 } | 68 | counts = Hash.new { |h,k| h[k] = 0 } |
69 | @words.each { |w| counts[w] += 1 } | 69 | @words.each { |w| counts[w] += 1 } |
70 | 70 | ||
71 | counts | 71 | counts |
72 | end | 72 | end |
73 | 73 | ||
74 | # Computes the entropy of a given string +s+ inside the document. | 74 | # Computes the entropy of a given string +s+ inside the document. |
75 | # | 75 | # |
76 | # If the string parameter is composed of many words (i.e. tokens separated | 76 | # If the string parameter is composed of many words (i.e. tokens separated |
77 | # by whitespace(s)), it is considered as an ngram. | 77 | # by whitespace(s)), it is considered as an ngram. |
78 | # | 78 | # |
79 | # entropy("guitar") #=> 0.00432114812727959 | 79 | # entropy("guitar") #=> 0.00432114812727959 |
80 | # entropy("dillinger escape plan") #=> 0.265862076325102 | 80 | # entropy("dillinger escape plan") #=> 0.265862076325102 |
81 | def entropy(s) | 81 | def entropy(s) |
82 | en = 0.0 | 82 | en = 0.0 |
83 | counts = self.count_words | 83 | counts = self.count_words |
84 | 84 | ||
85 | s.split.each do |w| | 85 | s.split.each do |w| |
86 | p_wi = counts[w].to_f/@words.count.to_f | 86 | p_wi = counts[w].to_f/@words.count.to_f |
87 | en += p_wi*Math.log2(p_wi) | 87 | en += p_wi*Math.log2(p_wi) |
88 | end | 88 | end |
89 | 89 | ||
90 | en *= -1 | 90 | en *= -1 |
91 | en | 91 | en |
92 | end | 92 | end |
93 | 93 | ||
94 | # Computes the term frequency of a given *word* +s+. | 94 | # Computes the term frequency of a given *word* +s+. |
95 | # | 95 | # |
96 | # tf("guitar") #=> 0.000380372765310004 | 96 | # tf("guitar") #=> 0.000380372765310004 |
97 | def tf(s) | 97 | def tf(s) |
98 | self.count_words[s].to_f/@words.size.to_f | 98 | self.count_words[s].to_f/@words.size.to_f |
99 | end | 99 | end |
100 | 100 | ||
101 | 101 | ||
102 | def initialize(content) | 102 | def initialize(content="") |
103 | @doc_content = content | 103 | @doc_content = content |
104 | @words = format_words | 104 | @words = format_words |
105 | end | 105 | end |
106 | 106 | ||
107 | protected :format_words | 107 | protected :format_words |
108 | end | 108 | end |
109 | 109 | ||
110 | # A WebDocument is a Document with a +url+. | 110 | # A WebDocument is a Document with a +url+. |
111 | class WebDocument < Document | 111 | class WebDocument < Document |
112 | attr_reader :url | 112 | attr_reader :url |
113 | 113 | ||
114 | # Returns the HTML text from the page of a given +url+. | 114 | # Returns the HTML text from the page of a given +url+. |
115 | def self.get_content(url) | 115 | def self.get_content(url) |
116 | require 'net/http' | 116 | require 'net/http' |
117 | Net::HTTP.get(URI.parse(url)) | 117 | Net::HTTP.get(URI.parse(url)) |
118 | end | 118 | end |
119 | 119 | ||
120 | # WebDocument constructor, the content of the Document is the HTML page | 120 | # WebDocument constructor, the content of the Document is the HTML page |
121 | # without the tags. | 121 | # without the tags. |
122 | def initialize(url) | 122 | def initialize(url) |
123 | @url = url | 123 | @url = url |
124 | super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags | 124 | super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags |
125 | end | 125 | end |
126 | end | 126 | end |
127 | 127 | ||
128 | # A WikipediaPage is a WebDocument. | 128 | # A WikipediaPage is a WebDocument. |
129 | class WikipediaPage < WebDocument | 129 | class WikipediaPage < WebDocument |
130 | require 'rexml/document' | 130 | require 'rexml/document' |
131 | require 'net/http' | 131 | require 'net/http' |
132 | require 'kconv' | 132 | require 'kconv' |
133 | 133 | ||
134 | 134 | ||
135 | def self.search_wikipedia_titles(name) | 135 | def self.search_wikipedia_titles(name) |
136 | raise ArgumentError, "Bad encoding", name unless name.isutf8 | 136 | raise ArgumentError, "Bad encoding", name unless name.isutf8 |
137 | 137 | ||
138 | res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).toutf8).elements['api/query/search'] | 138 | res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).toutf8).elements['api/query/search'] |
139 | 139 | ||
140 | res.collect { |e| e.attributes['title'] } unless res.nil? | 140 | res.collect { |e| e.attributes['title'] } unless res.nil? |
141 | end | 141 | end |
142 | 142 | ||
143 | def self.get_url(name) | 143 | def self.get_url(name) |
144 | raise ArgumentError, "Bad encoding", name unless name.isutf8 | 144 | raise ArgumentError, "Bad encoding", name unless name.isutf8 |
145 | 145 | ||
146 | atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).toutf8).elements['api/query/pages/page'].attributes | 146 | atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).toutf8).elements['api/query/pages/page'].attributes |
147 | 147 | ||
148 | atts['fullurl'] if atts['missing'].nil? | 148 | atts['fullurl'] if atts['missing'].nil? |
149 | end | 149 | end |
150 | 150 | ||
151 | def self.search_homepage(name) | 151 | def self.search_homepage(name) |
152 | title = WikipediaPage.search_wikipedia_titles name | 152 | title = WikipediaPage.search_wikipedia_titles name |
153 | 153 | ||
154 | WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty? | 154 | WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty? |
155 | end | 155 | end |
156 | 156 | ||
157 | # def initialize(name) | 157 | # def initialize(name) |
158 | # title = WikipediaPage.search_wikipedia_titles name | 158 | # title = WikipediaPage.search_wikipedia_titles name |
159 | # raise ArgumentError, "No page found" if title.empty? | 159 | # raise ArgumentError, "No page found" if title.empty? |
160 | # super WikipediaPage.get_url title[0] | 160 | # super WikipediaPage.get_url title[0] |
161 | # end | 161 | # end |
162 | end | 162 | end |
163 | end | 163 | end |
164 | 164 |
lib/mirimiri/query.rb
1 | #!/usr/bin/env ruby | 1 | #!/usr/bin/env ruby |
2 | 2 | ||
3 | #-- | 3 | #-- |
4 | # This file is a part of the mirimiri library | 4 | # This file is a part of the mirimiri library |
5 | # | 5 | # |
6 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | 6 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> |
7 | # | 7 | # |
8 | # This program is free software: you can redistribute it and/or modify | 8 | # This program is free software: you can redistribute it and/or modify |
9 | # it under the terms of the GNU General Public License as published by | 9 | # it under the terms of the GNU General Public License as published by |
10 | # the Free Software Foundation, either version 3 of the License, or | 10 | # the Free Software Foundation, either version 3 of the License, or |
11 | # (at your option) any later version. | 11 | # (at your option) any later version. |
12 | # | 12 | # |
13 | # This program is distributed in the hope that it will be useful, | 13 | # This program is distributed in the hope that it will be useful, |
14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | 14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | # GNU General Public License for more details. | 16 | # GNU General Public License for more details. |
17 | # | 17 | # |
18 | # You should have received a copy of the GNU General Public License | 18 | # You should have received a copy of the GNU General Public License |
19 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | 19 | # along with this program. If not, see <http://www.gnu.org/licenses/>. |
20 | #++ | 20 | #++ |
21 | 21 | ||
22 | class Query | 22 | class Query |
23 | end | 23 | end |
24 | 24 | ||
25 | module Indri | 25 | module Indri |
26 | 26 | ||
27 | class Parameters | 27 | class Parameters |
28 | attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline | 28 | attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline |
29 | 29 | ||
30 | def initialize(corpus,mem="1g",count="1000",offset="1",run_id="default",print_query=false,print_docs=false) | 30 | def initialize(corpus,count="1000",mem="1g",offset="1",run_id="default",print_query=false,print_docs=false) |
31 | @index_path = corpus | 31 | @index_path = corpus |
32 | @memory = mem | 32 | @memory = mem |
33 | @count = count | 33 | @count = count |
34 | @offset = offset | 34 | @offset = offset |
35 | @run_id = run_id | 35 | @run_id = run_id |
36 | @print_query = print_query ? "true" : "false" | 36 | @print_query = print_query ? "true" : "false" |
37 | @print_docs = print_docs ? "true" : "false" | 37 | @print_docs = print_docs ? "true" : "false" |
38 | end | 38 | end |
39 | 39 | ||
40 | def to_s | 40 | def to_s |
41 | h = "<parameters>\n" | 41 | h = "<memory>#{@memory}</memory>\n" |
42 | h += "<memory>#{@memory}</memory>\n" | ||
43 | h += "<index>#{@index_path}</index>\n" | 42 | h += "<index>#{@index_path}</index>\n" |
44 | h += "<count>#{@count}</count>\n" | 43 | h += "<count>#{@count}</count>\n" |
45 | unless @baseline.nil? | 44 | unless @baseline.nil? |
46 | h += "<baseline>#{@baseline}</baseline>\n" | 45 | h += "<baseline>#{@baseline}</baseline>\n" |
47 | else | 46 | else |
48 | h += "<rule>#{@rule}</rule>\n" | 47 | h += "<rule>#{@rule}</rule>\n" |
49 | end | 48 | end |
49 | h += "<trecFormat>true</trecFormat>\n" | ||
50 | h += "<queryOffset>#{@offset}</queryOffset>\n" | 50 | h += "<queryOffset>#{@offset}</queryOffset>\n" |
51 | h += "<runID>#{@run_id}</runID>\n" | 51 | h += "<runID>#{@run_id}</runID>\n" |
52 | h += "<printQuery>#{@print_query}</printQuery>\n" | 52 | h += "<printQuery>#{@print_query}</printQuery>\n" |
53 | h += "<printDocuments>#{@print_docs}</printDocuments>\n" | 53 | h += "<printDocuments>#{@print_docs}</printDocuments>\n" |
54 | 54 | ||
55 | h | 55 | h |
56 | end | 56 | end |
57 | end | 57 | end |
58 | 58 | ||
59 | class IndriQuery < Query | 59 | class IndriQuery < Query |
60 | attr_accessor :id, :query, :params, :rule | 60 | attr_accessor :id, :query, :rule |
61 | 61 | ||
62 | def initialize(id,query,params) | 62 | def initialize(id,query) |
63 | @params = params | ||
64 | # Here we set the default retrieval model as Language Modeling | ||
65 | # with a Dirichlet smoothing at 2500. | ||
66 | # TODO: maybe a Rule class... | ||
67 | @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil? | ||
68 | |||
69 | @id = id | 63 | @id = id |
70 | @query = query | 64 | @query = query |
71 | end | 65 | end |
72 | 66 | ||
73 | def to_s | 67 | def to_s |
74 | h = @params.to_s | 68 | h = "<query>\n" |
75 | h += "<query>\n" | ||
76 | h += "<number>#{@id}</number>\n" | 69 | h += "<number>#{@id}</number>\n" |
77 | h += "<text>#{@query}</text>\n" | 70 | h += "<text>#{@query}</text>\n" |
78 | h += "</query>\n" | 71 | h += "</query>\n" |
72 | |||
73 | h | ||
74 | end | ||
75 | end | ||
76 | |||
77 | class IndriQueries | ||
78 | attr_accessor :params, :queries | ||
79 | |||
80 | def initialize(params,*queries) | ||
81 | @queries = queries | ||
82 | |||
83 | @params = params | ||
84 | # Here we set the default retrieval model as Language Modeling | ||
85 | # with a Dirichlet smoothing at 2500. | ||
86 | # TODO: maybe a Rule class... | ||
87 | @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil? | ||
88 | end | ||
89 | |||
90 | def to_s | ||
91 | h = "<parameters>\n" | ||
92 | h += @params.to_s | ||
93 | h += @queries.collect { |q| q.to_s }.join "" |
lib/mirimiri/string.rb
1 | #!/usr/bin/env ruby | 1 | #!/usr/bin/env ruby |
2 | 2 | ||
3 | #-- | 3 | #-- |
4 | # This file is a part of the mirimiri library | 4 | # This file is a part of the mirimiri library |
5 | # | 5 | # |
6 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | 6 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> |
7 | # | 7 | # |
8 | # This program is free software: you can redistribute it and/or modify | 8 | # This program is free software: you can redistribute it and/or modify |
9 | # it under the terms of the GNU General Public License as published by | 9 | # it under the terms of the GNU General Public License as published by |
10 | # the Free Software Foundation, either version 3 of the License, or | 10 | # the Free Software Foundation, either version 3 of the License, or |
11 | # (at your option) any later version. | 11 | # (at your option) any later version. |
12 | # | 12 | # |
13 | # This program is distributed in the hope that it will be useful, | 13 | # This program is distributed in the hope that it will be useful, |
14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | 14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | # GNU General Public License for more details. | 16 | # GNU General Public License for more details. |
17 | # | 17 | # |
18 | # You should have received a copy of the GNU General Public License | 18 | # You should have received a copy of the GNU General Public License |
19 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | 19 | # along with this program. If not, see <http://www.gnu.org/licenses/>. |
20 | #++ | 20 | #++ |
21 | 21 | ||
22 | module Mirimiri | 22 | module Mirimiri |
23 | 23 | ||
24 | # These are the default stopwords provided by Lemur. | 24 | # These are the default stopwords provided by Lemur. |
25 | Stoplist = [ | 25 | Stoplist = [ |
26 | "a", "anything", "anyway", "anywhere", "apart", "are", "around", "as", "at", "av", | 26 | "a", "anything", "anyway", "anywhere", "apart", "are", "around", "as", "at", "av", |
27 | "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", | 27 | "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", |
28 | "behind", "being", "below", "beside", "besides", "between", "beyond", "both", "but", "by", | 28 | "behind", "being", "below", "beside", "besides", "between", "beyond", "both", "but", "by", |
29 | "can", "cannot", "canst", "certain", "cf", "choose", "contrariwise", "cos", "could", "cu", | 29 | "can", "cannot", "canst", "certain", "cf", "choose", "contrariwise", "cos", "could", "cu", |
30 | "day", "do", "does", "doesn't", "doing", "dost", "doth", "double", "down", "dual", "during", | 30 | "day", "do", "does", "doesn't", "doing", "dost", "doth", "double", "down", "dual", "during", |
31 | "each", "either", "else", "elsewhere", "enough", "et", "etc", "even", "ever", "every", | 31 | "each", "either", "else", "elsewhere", "enough", "et", "etc", "even", "ever", "every", |
32 | "everybody", "everyone", "everything", "everywhere", "except", "excepted", "excepting", | 32 | "everybody", "everyone", "everything", "everywhere", "except", "excepted", "excepting", |
33 | "exception", "exclude", "excluding", "exclusive", "far", "farther", "farthest", "few", "ff", | 33 | "exception", "exclude", "excluding", "exclusive", "far", "farther", "farthest", "few", "ff", |
34 | "first", "for", "formerly", "forth", "forward", "from", "front", "further", "furthermore", | 34 | "first", "for", "formerly", "forth", "forward", "from", "front", "further", "furthermore", |
35 | "furthest", "get", "go", "had", "halves", "hardly", "has", "hast", "hath", "have", "he", | 35 | "furthest", "get", "go", "had", "halves", "hardly", "has", "hast", "hath", "have", "he", |
36 | "hence", "henceforth", "her", "here", "hereabouts", "hereafter", "hereby", "herein", "hereto", | 36 | "hence", "henceforth", "her", "here", "hereabouts", "hereafter", "hereby", "herein", "hereto", |
37 | "hereupon", "hers", "herself", "him", "himself", "hindmost", "his", "hither", "hitherto", | 37 | "hereupon", "hers", "herself", "him", "himself", "hindmost", "his", "hither", "hitherto", |
38 | "how", "however", "howsoever", "i", "ie", "if", "in", "inasmuch", "inc", "include", | 38 | "how", "however", "howsoever", "i", "ie", "if", "in", "inasmuch", "inc", "include", |
39 | "included", "including", "indeed", "indoors", "inside", "insomuch", "instead", "into", | 39 | "included", "including", "indeed", "indoors", "inside", "insomuch", "instead", "into", |
40 | "inward", "inwards", "is", "it", "its", "itself", "just", "kind", "kg", "km", "last", | 40 | "inward", "inwards", "is", "it", "its", "itself", "just", "kind", "kg", "km", "last", |
41 | "latter", "latterly", "less", "lest", "let", "like", "little", "ltd", "many", "may", "maybe", | 41 | "latter", "latterly", "less", "lest", "let", "like", "little", "ltd", "many", "may", "maybe", |
42 | "me", "meantime", "meanwhile", "might", "moreover", "most", "mostly", "more", "mr", "mrs", | 42 | "me", "meantime", "meanwhile", "might", "moreover", "most", "mostly", "more", "mr", "mrs", |
43 | "ms", "much", "must", "my", "myself", "namely", "need", "neither", "never", "nevertheless", | 43 | "ms", "much", "must", "my", "myself", "namely", "need", "neither", "never", "nevertheless", |
44 | "next", "no", "nobody", "none", "nonetheless", "noone", "nope", "nor", "not", "nothing", | 44 | "next", "no", "nobody", "none", "nonetheless", "noone", "nope", "nor", "not", "nothing", |
45 | "notwithstanding", "now", "nowadays", "nowhere", "of", "off", "often", "ok", "on", "once", | 45 | "notwithstanding", "now", "nowadays", "nowhere", "of", "off", "often", "ok", "on", "once", |
46 | "one", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", | 46 | "one", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", |
47 | "ourselves", "out", "outside", "over", "own", "per", "perhaps", "plenty", "provide", "quite", | 47 | "ourselves", "out", "outside", "over", "own", "per", "perhaps", "plenty", "provide", "quite", |
48 | "rather", "really", "round", "said", "sake", "same", "sang", "save", "saw", "see", "seeing", | 48 | "rather", "really", "round", "said", "sake", "same", "sang", "save", "saw", "see", "seeing", |
49 | "seem", "seemed", "seeming", "seems", "seen", "seldom", "selves", "sent", "several", "shalt", | 49 | "seem", "seemed", "seeming", "seems", "seen", "seldom", "selves", "sent", "several", "shalt", |
50 | "she", "should", "shown", "sideways", "since", "slept", "slew", "slung", "slunk", "smote", | 50 | "she", "should", "shown", "sideways", "since", "slept", "slew", "slung", "slunk", "smote", |
51 | "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", | 51 | "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", |
52 | "somewhat", "somewhere", "spake", "spat", "spoke", "spoken", "sprang", "sprung", "stave", | 52 | "somewhat", "somewhere", "spake", "spat", "spoke", "spoken", "sprang", "sprung", "stave", |
53 | "staves", "still", "such", "supposing", "than", "that", "the", "thee", "their", "them", | 53 | "staves", "still", "such", "supposing", "than", "that", "the", "thee", "their", "them", |
54 | "themselves", "then", "thence", "thenceforth", "there", "thereabout", "thereabouts", | 54 | "themselves", "then", "thence", "thenceforth", "there", "thereabout", "thereabouts", |
55 | "thereafter", "thereby", "therefore", "therein", "thereof", "thereon", "thereto", "thereupon", | 55 | "thereafter", "thereby", "therefore", "therein", "thereof", "thereon", "thereto", "thereupon", |
56 | "these", "they", "this", "those", "thou", "though", "thrice", "through", "throughout", "thru", | 56 | "these", "they", "this", "those", "thou", "though", "thrice", "through", "throughout", "thru", |
57 | "thus", "thy", "thyself", "till", "to", "together", "too", "toward", "towards", "ugh", | 57 | "thus", "thy", "thyself", "till", "to", "together", "too", "toward", "towards", "ugh", |
58 | "unable", "under", "underneath", "unless", "unlike", "until", "up", "upon", "upward", | 58 | "unable", "under", "underneath", "unless", "unlike", "until", "up", "upon", "upward", |
59 | "upwards", "us", "use", "used", "using", "very", "via", "vs", "want", "was", "we", "week", | 59 | "upwards", "us", "use", "used", "using", "very", "via", "vs", "want", "was", "we", "week", |
60 | "well", "were", "what", "whatever", "whatsoever", "when", "whence", "whenever", "whensoever", | 60 | "well", "were", "what", "whatever", "whatsoever", "when", "whence", "whenever", "whensoever", |
61 | "where", "whereabouts", "whereafter", "whereas", "whereat", "whereby", "wherefore", | 61 | "where", "whereabouts", "whereafter", "whereas", "whereat", "whereby", "wherefore", |
62 | "wherefrom", "wherein", "whereinto", "whereof", "whereon", "wheresoever", "whereto", | 62 | "wherefrom", "wherein", "whereinto", "whereof", "whereon", "wheresoever", "whereto", |
63 | "whereunto", "whereupon", "wherever", "wherewith", "whether", "whew", "which", "whichever", | 63 | "whereunto", "whereupon", "wherever", "wherewith", "whether", "whew", "which", "whichever", |
64 | "whichsoever", "while", "whilst", "whither", "who", "whoa", "whoever", "whole", "whom", | 64 | "whichsoever", "while", "whilst", "whither", "who", "whoa", "whoever", "whole", "whom", |
65 | "whomever", "whomsoever", "whose", "whosoever", "why", "will", "wilt", "with", "within", | 65 | "whomever", "whomsoever", "whose", "whosoever", "why", "will", "wilt", "with", "within", |
66 | "without", "worse", "worst", "would", "wow", "ye", "yet", "year", "yippee", "you", "your", | 66 | "without", "worse", "worst", "would", "wow", "ye", "yet", "year", "yippee", "you", "your", |
67 | "yours", "yourself", "yourselves" | 67 | "yours", "yourself", "yourselves" |
68 | ] | 68 | ] |
69 | 69 | ||
70 | Transmap = { | ||
71 | "\xC3\x80" => "A", "\xC3\x81" => "A", "\xC3\x82" => "A", "\xC3\x83" => "A", | ||
72 | "\xC3\x84" => "A", "\xC3\x85" => "A", "\xC3\x86" => "AE","\xC3\x87" => "C", | ||
73 | "\xC3\x88" => "E", "\xC3\x89" => "E", "\xC3\x8A" => "E", "\xC3\x8B" => "E", | ||
74 | "\xC3\x8C" => "I", "\xC3\x8D" => "I", "\xC3\x8E" => "I", "\xC3\x8F" => "I", | ||
75 | "\xC3\x90" => "D", "\xC3\x91" => "N", "\xC3\x92" => "O", "\xC3\x93" => "O", | ||
76 | "\xC3\x94" => "O", "\xC3\x95" => "O", "\xC3\x96" => "O", "\xC3\x98" => "O", | ||
77 | "\xC3\x99" => "U", "\xC3\x9A" => "U", "\xC3\x9B" => "U", "\xC3\x9C" => "U", | ||
78 | "\xC3\x9D" => "Y", "\xC3\x9E" => "P", "\xC3\x9F" => "ss", | ||
79 | "\xC3\xA0" => "a", "\xC3\xA1" => "a", "\xC3\xA2" => "a", "\xC3\xA3" => "a", | ||
80 | "\xC3\xA4" => "a", "\xC3\xA5" => "a", "\xC3\xA6" => "ae","\xC3\xA7" => "c", | ||
81 | "\xC3\xA8" => "e", "\xC3\xA9" => "e", "\xC3\xAA" => "e", "\xC3\xAB" => "e", | ||
82 | "\xC3\xAC" => "i", "\xC3\xAD" => "i", "\xC3\xAE" => "i", "\xC3\xAF" => "i", | ||
83 | "\xC3\xB0" => "o", "\xC3\xB1" => "n", "\xC3\xB2" => "o", "\xC3\xB3" => "o", | ||
84 | "\xC3\xB4" => "o", "\xC3\xB5" => "o", "\xC3\xB6" => "o", "\xC3\xB8" => "o", | ||
85 | "\xC3\xB9" => "u", "\xC3\xBA" => "u", "\xC3\xBB" => "u", "\xC3\xBC" => "u", | ||
86 | "\xC3\xBD" => "y", "\xC3\xBE" => "p", "\xC3\xBF" => "y", | ||
87 | "\xC4\x80" => "A", "\xC4\x81" => "a", "\xC4\x82" => "A", "\xC4\x83" => "a", | ||
88 | "\xC4\x84" => "A", "\xC4\x85" => "a", "\xC4\x86" => "C", "\xC4\x87" => "c", | ||
89 | "\xC4\x88" => "C", "\xC4\x89" => "c", "\xC4\x8A" => "C", "\xC4\x8B" => "c", | ||
90 | "\xC4\x8C" => "C", "\xC4\x8D" => "c", "\xC4\x8E" => "D", "\xC4\x8F" => "d", | ||
91 | "\xC4\x90" => "D", "\xC4\x91" => "d", "\xC4\x92" => "E", "\xC4\x93" => "e", | ||
92 | "\xC4\x94" => "E", "\xC4\x95" => "e", "\xC4\x96" => "E", "\xC4\x97" => "e", | ||
93 | "\xC4\x98" => "E", "\xC4\x99" => "e", "\xC4\x9A" => "E", "\xC4\x9B" => "e", | ||
94 | "\xC4\x9C" => "G", "\xC4\x9D" => "g", "\xC4\x9E" => "G", "\xC4\x9F" => "g", | ||
95 | "\xC4\xA0" => "G", "\xC4\xA1" => "g", "\xC4\xA2" => "G", "\xC4\xA3" => "g", | ||
96 | "\xC4\xA4" => "H", "\xC4\xA5" => "h", "\xC4\xA6" => "H", "\xC4\xA7" => "h", | ||
97 | "\xC4\xA8" => "I", "\xC4\xA9" => "i", "\xC4\xAA" => "I", "\xC4\xAB" => "i", | ||
98 | "\xC4\xAC" => "I", "\xC4\xAD" => "i", "\xC4\xAE" => "I", "\xC4\xAF" => "i", | ||
99 | "\xC4\xB0" => "I", "\xC4\xB1" => "i", "\xC4\xB2" => "IJ","\xC4\xB3" => "ij", | ||
100 | "\xC4\xB4" => "J", "\xC4\xB5" => "j", "\xC4\xB6" => "K", "\xC4\xB7" => "k", | ||
101 | "\xC4\xB8" => "k", "\xC4\xB9" => "L", "\xC4\xBA" => "l", "\xC4\xBB" => "L", | ||
102 | "\xC4\xBC" => "l", "\xC4\xBD" => "L", "\xC4\xBE" => "l", "\xC4\xBF" => "L", | ||
103 | "\xC5\x80" => "l", "\xC5\x81" => "L", "\xC5\x82" => "l", "\xC5\x83" => "N", | ||
104 | "\xC5\x84" => "n", "\xC5\x85" => "N", "\xC5\x86" => "n", "\xC5\x87" => "N", | ||
105 | "\xC5\x88" => "n", "\xC5\x89" => "n", "\xC5\x8A" => "N", "\xC5\x8B" => "n", | ||
106 | "\xC5\x8C" => "O", "\xC5\x8D" => "o", "\xC5\x8E" => "O", "\xC5\x8F" => "o", | ||
107 | "\xC5\x90" => "O", "\xC5\x91" => "o", "\xC5\x92" => "CE","\xC5\x93" => "ce", | ||
108 | "\xC5\x94" => "R", "\xC5\x95" => "r", "\xC5\x96" => "R", "\xC5\x97" => "r", | ||
109 | "\xC5\x98" => "R", "\xC5\x99" => "r", "\xC5\x9A" => "S", "\xC5\x9B" => "s", | ||
110 | "\xC5\x9C" => "S", "\xC5\x9D" => "s", "\xC5\x9E" => "S", "\xC5\x9F" => "s", | ||
111 | "\xC5\xA0" => "S", "\xC5\xA1" => "s", "\xC5\xA2" => "T", "\xC5\xA3" => "t", | ||
112 | "\xC5\xA4" => "T", "\xC5\xA5" => "t", "\xC5\xA6" => "T", "\xC5\xA7" => "t", | ||
113 | "\xC5\xA8" => "U", "\xC5\xA9" => "u", "\xC5\xAA" => "U", "\xC5\xAB" => "u", | ||
114 | "\xC5\xAC" => "U", "\xC5\xAD" => "u", "\xC5\xAE" => "U", "\xC5\xAF" => "u", | ||
115 | "\xC5\xB0" => "U", "\xC5\xB1" => "u", "\xC5\xB2" => "U", "\xC5\xB3" => "u", | ||
116 | "\xC5\xB4" => "W", "\xC5\xB5" => "w", "\xC5\xB6" => "Y", "\xC5\xB7" => "y", | ||
117 | "\xC5\xB8" => "Y", "\xC5\xB9" => "Z", "\xC5\xBA" => "z", "\xC5\xBB" => "Z", | ||
118 | "\xC5\xBC" => "z", "\xC5\xBD" => "Z", "\xC5\xBE" => "z", "\xC6\x8F" => "E", | ||
119 | "\xC6\xA0" => "O", "\xC6\xA1" => "o", "\xC6\xAF" => "U", "\xC6\xB0" => "u", | ||
120 | "\xC7\x8D" => "A", "\xC7\x8E" => "a", "\xC7\x8F" => "I", | ||
121 | "\xC7\x90" => "i", "\xC7\x91" => "O", "\xC7\x92" => "o", "\xC7\x93" => "U", | ||
122 | "\xC7\x94" => "u", "\xC7\x95" => "U", "\xC7\x96" => "u", "\xC7\x97" => "U", | ||
123 | "\xC7\x98" => "u", "\xC7\x99" => "U", "\xC7\x9A" => "u", "\xC7\x9B" => "U", | ||
124 | "\xC7\x9C" => "u", | ||
125 | "\xC7\xBA" => "A", "\xC7\xBB" => "a", "\xC7\xBC" => "AE","\xC7\xBD" => "ae", | ||
126 | "\xC7\xBE" => "O", "\xC7\xBF" => "o", | ||
127 | "\xC9\x99" => "e", | ||
128 | "\xC2\x82" => ",", # High code comma | ||
129 | "\xC2\x84" => ",,", # High code double comma | ||
130 | "\xC2\x85" => "...", # Triple dot | ||
131 | "\xC2\x88" => "^", # High caret | ||
132 | "\xC2\x91" => "\x27", # Forward single quote | ||
133 | "\xC2\x92" => "\x27", # Reverse single quote | ||
134 | "\xC2\x93" => "\x22", # Forward double quote | ||
135 | "\xC2\x94" => "\x22", # Reverse double quote | ||
136 | "\xC2\x96" => "-", # High hyphen | ||
137 | "\xC2\x97" => "--", # Double hyphen | ||
138 | "\xC2\xA6" => "|", # Split vertical bar | ||
139 | "\xC2\xAB" => "<<", # Double less than | ||
140 | "\xC2\xBB" => ">>", # Double greater than | ||
141 | "\xC2\xBC" => "1/4", # one quarter | ||
142 | "\xC2\xBD" => "1/2", # one half | ||
143 | "\xC2\xBE" => "3/4", # three quarters | ||
144 | "\xCA\xBF" => "\x27", # c-single quote | ||
145 | "\xCC\xA8" => "", # modifier - under curve | ||
146 | "\xCC\xB1" => "", # modifier - under line | ||
147 | /\W/ => "" | ||
148 | } | ||
70 | 149 | ||
71 | end | 150 | end |
72 | 151 | ||
73 | # Extension of the standard class String with useful functions. | 152 | # Extension of the standard class String with useful functions. |
74 | class String | 153 | class String |
75 | include Mirimiri | 154 | include Mirimiri |
155 | |||
156 | def unaccent | ||
157 | # force_encoding is needed with ruby1.9 | ||
158 | Transmap.inject(self.force_encoding("ASCII-8BIT")) { |str, (utf8, asc)| str.gsub(utf8, asc) } | ||
159 | end | ||
76 | 160 | ||
77 | # Returns +true+ if +self+ belongs to Mirimiri::Stoplist, +false+ otherwise. | 161 | # Returns +true+ if +self+ belongs to Mirimiri::Stoplist, +false+ otherwise. |
78 | def is_stopword? | 162 | def is_stopword? |
79 | Stoplist.include?(self.downcase) | 163 | Stoplist.include?(self.downcase) |
80 | end | 164 | end |
81 | 165 | ||
82 | # Do not use. | 166 | # Do not use. |
83 | # TODO: revamp. Find out why this function is here. | 167 | # TODO: revamp. Find out why this function is here. |
84 | def remove_special_characters | 168 | def remove_special_characters |
85 | self.split.collect { |w| w.gsub(/\W/,' ').split.collect { |w| w.gsub(/\W/,' ').strip.sub(/\A.\z/, '')}.join(' ').strip.sub(/\A.\z/, '')}.join(' ') | 169 | self.split.collect { |w| w.gsub(/\W/,' ').split.collect { |w| w.gsub(/\W/,' ').strip.sub(/\A.\z/, '')}.join(' ').strip.sub(/\A.\z/, '')}.join(' ') |
86 | end | 170 | end |
87 | 171 | ||
88 | # Removes all XML-like tags from +self+. | 172 | # Removes all XML-like tags from +self+. |
89 | # | 173 | # |
90 | # s = "<html><body>test</body></html>" | 174 | # s = "<html><body>test</body></html>" |
91 | # s.strip_xml_tags! | 175 | # s.strip_xml_tags! |
92 | # s #=> "test" | 176 | # s #=> "test" |
93 | def strip_xml_tags! | 177 | def strip_xml_tags! |
94 | replace strip_with_pattern /<\/?[^>]*>/ | 178 | replace strip_with_pattern /<\/?[^>]*>/ |
95 | end | 179 | end |
96 | 180 | ||
97 | # Removes all XML-like tags from +self+. | 181 | # Removes all XML-like tags from +self+. |
98 | # | 182 | # |
99 | # s = "<html><body>test</body></html>" | 183 | # s = "<html><body>test</body></html>" |
100 | # s.strip_xml_tags #=> "test" | 184 | # s.strip_xml_tags #=> "test" |
101 | # s #=> "<html><body>test</body></html>" | 185 | # s #=> "<html><body>test</body></html>" |
102 | def strip_xml_tags | 186 | def strip_xml_tags |
103 | dup.strip_xml_tags! | 187 | dup.strip_xml_tags! |
104 | end | 188 | end |
105 | 189 | ||
106 | # Removes all Javascript sources from +self+. | 190 | # Removes all Javascript sources from +self+. |
107 | # | 191 | # |
108 | # s = "<script type='text/javascript'> | 192 | # s = "<script type='text/javascript'> |
109 | # var skin='vector', | 193 | # var skin='vector', |
110 | # stylepath='http://bits.wikimedia.org/skins-1.5' | 194 | # stylepath='http://bits.wikimedia.org/skins-1.5' |
111 | # </script> | 195 | # </script> |
112 | # | 196 | # |
113 | # test" | 197 | # test" |
114 | # s.strip_javascripts! | 198 | # s.strip_javascripts! |
115 | # s #=> "test" | 199 | # s #=> "test" |
116 | def strip_javascripts! | 200 | def strip_javascripts! |
117 | replace strip_with_pattern /<script type="text\/javascript">(.+?)<\/script>/m | 201 | replace strip_with_pattern /<script type="text\/javascript">(.+?)<\/script>/m |
118 | end | 202 | end |
119 | 203 | ||
120 | # Removes all Javascript sources from +self+. | 204 | # Removes all Javascript sources from +self+. |
121 | # | 205 | # |
122 | # s = "<script type='text/javascript'> | 206 | # s = "<script type='text/javascript'> |
123 | # var skin='vector', | 207 | # var skin='vector', |
124 | # stylepath='http://bits.wikimedia.org/skins-1.5' | 208 | # stylepath='http://bits.wikimedia.org/skins-1.5' |
125 | # </script> | 209 | # </script> |
126 | # | 210 | # |
127 | # test" | 211 | # test" |
128 | # s.strip_javascripts #=> "test" | 212 | # s.strip_javascripts #=> "test" |
129 | def strip_javascripts | 213 | def strip_javascripts |
130 | dup.strip_javascripts! | 214 | dup.strip_javascripts! |
131 | end | 215 | end |
132 | 216 | ||
133 | def strip_stylesheets! | 217 | def strip_stylesheets! |
134 | # TODO: revamp. Unclear what this is for. | 218 | # TODO: revamp. Unclear what this is for. |
135 | replace strip_with_pattern /<style type="text\/css">(.+?)<\/style>/m | 219 | replace strip_with_pattern /<style type="text\/css">(.+?)<\/style>/m |
136 | end | 220 | end |
137 | 221 | ||
138 | def strip_stylesheets | 222 | def strip_stylesheets |
139 | dup.strip_stylesheets! | 223 | dup.strip_stylesheets! |
140 | end | 224 | end |
141 | 225 | ||
142 | # Removes punctuation from +self+. | 226 | # Removes punctuation from +self+. |
143 | # | 227 | # |
144 | # s = "hello, world. how are you?!" | 228 | # s = "hello, world. how are you?!" |
145 | # s.strip_punctuation! | 229 | # s.strip_punctuation! |
146 | # s # => "hello world how are you" | 230 | # s # => "hello world how are you" |
147 | def strip_punctuation! | 231 | def strip_punctuation! |
148 | replace strip_with_pattern /[^a-zA-Z0-9\-\s]/ | 232 | replace strip_with_pattern /[^a-zA-Z0-9\-\s]/ |
149 | end | 233 | end |
150 | 234 | ||
151 | # Removes punctuation from +self+. | 235 | # Removes punctuation from +self+. |
152 | # | 236 | # |
153 | # s = "hello, world. how are you?!" | 237 | # s = "hello, world. how are you?!" |
154 | # s.strip_punctuation # => "hello world how are you" | 238 | # s.strip_punctuation # => "hello world how are you" |
155 | def strip_punctuation | 239 | def strip_punctuation |
156 | dup.strip_punctuation! | 240 | dup.strip_punctuation! |
157 | end | 241 | end |
158 | 242 | ||
159 | # Returns the text values inside all occurrences of an XML tag in +self+ | 243 | # Returns the text values inside all occurrences of an XML tag in +self+ |
160 | # | 244 | # |
161 | # s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre" | 245 | # s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre" |
162 | # s.extract_xmltags_values 'a' #=> ["Indianapolis", "Indiana"] | 246 | # s.extract_xmltags_values 'a' #=> ["Indianapolis", "Indiana"] |
163 | def extract_xmltags_values(tag_name) | 247 | def extract_xmltags_values(tag_name) |
164 | self.scan(/<#{tag_name}.*?>(.+?)<\/#{tag_name}>/).flatten | 248 | self.scan(/<#{tag_name}.*?>(.+?)<\/#{tag_name}>/).flatten |
165 | end | 249 | end |
166 | 250 | ||
167 | def strip_with_pattern(pattern) | 251 | def strip_with_pattern(pattern) |
168 | require 'cgi' | 252 | require 'cgi' |
169 | require 'kconv' | 253 | require 'kconv' |
170 | CGI::unescapeHTML(self.gsub(pattern,"")).toutf8 | 254 | CGI::unescapeHTML(self.gsub(pattern,"")).toutf8 |
171 | end | 255 | end |
172 | 256 | ||
173 | private :strip_with_pattern | 257 | private :strip_with_pattern |
174 | end | 258 | end |
175 | 259 |