Commit 845768f8ac5a1593db356377fcc68208c12efa74

Authored by Romain Deveaud
1 parent 175908fe2a
Exists in master

creating a group of indri queries is possible. added an accent stripping function.

Showing 4 changed files with 116 additions and 17 deletions Inline Diff

1 require 'rir' 1 require 'mirimiri'
2 2
3 # Concatenates all lines from one file, without \n 3 # Concatenates all lines from one file, without \n
4 readme = File.open('README.markdown').readlines.collect { |l| l.chomp }.join(" ") 4 readme = File.open('README.markdown').readlines.collect { |l| l.chomp }.join(" ")
5 5
6 # Creates the document with a string 6 # Creates the document with a string
7 doc = RIR::Document.new readme 7 doc = Mirimiri::Document.new readme
8 8
9 # Outputs all the unique words of the document with their entropy scores 9 # Outputs all the unique words of the document with their entropy scores
10 p doc.words.collect { |w| "#{w} => #{doc.entropy w}" } 10 p doc.words.collect { |w| "#{w} => #{doc.entropy w}" }
11 11
lib/mirimiri/document.rb
1 #!/usr/bin/env ruby 1 #!/usr/bin/env ruby
2 2
3 #-- 3 #--
4 # This file is a part of the mirimiri library 4 # This file is a part of the mirimiri library
5 # 5 #
6 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> 6 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
7 # 7 #
8 # This program is free software: you can redistribute it and/or modify 8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by 9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or 10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version. 11 # (at your option) any later version.
12 # 12 #
13 # This program is distributed in the hope that it will be useful, 13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details. 16 # GNU General Public License for more details.
17 # 17 #
18 # You should have received a copy of the GNU General Public License 18 # You should have received a copy of the GNU General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>. 19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 #++ 20 #++
21 21
22 22
23 # General module 23 # General module
24 module Mirimiri 24 module Mirimiri
25 25
26 # A Document is a bag of words and is constructed from a string. 26 # A Document is a bag of words and is constructed from a string.
27 class Document 27 class Document
28 attr_reader :words, :doc_content 28 attr_reader :words, :doc_content
29 29
30 # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html 30 # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html
31 # and the \\W special escape). 31 # and the \\W special escape).
32 # 32 #
33 # Protected function, only meant to be called at the initialization. 33 # Protected function, only meant to be called at the initialization.
34 def format_words 34 def format_words
35 wo = [] 35 wo = []
36 36
37 @doc_content.split.each do |w| 37 @doc_content.split.each do |w|
38 w.split(/\W/).each do |sw| 38 w.split(/\W/).each do |sw|
39 wo.push(sw.downcase) if sw =~ /[a-zA-Z]/ 39 wo.push(sw.downcase) if sw =~ /[a-zA-Z]/
40 end 40 end
41 end 41 end
42 42
43 wo 43 wo
44 end 44 end
45 45
46 # Returns an Array containing the +n+-grams (words) from the current Document. 46 # Returns an Array containing the +n+-grams (words) from the current Document.
47 # 47 #
48 # ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...] 48 # ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]
49 def ngrams(n) 49 def ngrams(n)
50 window = [] 50 window = []
51 ngrams_array = [] 51 ngrams_array = []
52 52
53 @words.each do |w| 53 @words.each do |w|
54 window.push(w) 54 window.push(w)
55 if window.size == n 55 if window.size == n
56 ngrams_array.push window.join(" ") 56 ngrams_array.push window.join(" ")
57 window.delete_at(0) 57 window.delete_at(0)
58 end 58 end
59 end 59 end
60 60
61 ngrams_array.uniq 61 ngrams_array.uniq
62 end 62 end
63 63
64 # Returns a Hash containing the words and their associated counts in the current Document. 64 # Returns a Hash containing the words and their associated counts in the current Document.
65 # 65 #
66 # count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... } 66 # count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }
67 def count_words 67 def count_words
68 counts = Hash.new { |h,k| h[k] = 0 } 68 counts = Hash.new { |h,k| h[k] = 0 }
69 @words.each { |w| counts[w] += 1 } 69 @words.each { |w| counts[w] += 1 }
70 70
71 counts 71 counts
72 end 72 end
73 73
74 # Computes the entropy of a given string +s+ inside the document. 74 # Computes the entropy of a given string +s+ inside the document.
75 # 75 #
76 # If the string parameter is composed of many words (i.e. tokens separated 76 # If the string parameter is composed of many words (i.e. tokens separated
77 # by whitespace(s)), it is considered as an ngram. 77 # by whitespace(s)), it is considered as an ngram.
78 # 78 #
79 # entropy("guitar") #=> 0.00432114812727959 79 # entropy("guitar") #=> 0.00432114812727959
80 # entropy("dillinger escape plan") #=> 0.265862076325102 80 # entropy("dillinger escape plan") #=> 0.265862076325102
81 def entropy(s) 81 def entropy(s)
82 en = 0.0 82 en = 0.0
83 counts = self.count_words 83 counts = self.count_words
84 84
85 s.split.each do |w| 85 s.split.each do |w|
86 p_wi = counts[w].to_f/@words.count.to_f 86 p_wi = counts[w].to_f/@words.count.to_f
87 en += p_wi*Math.log2(p_wi) 87 en += p_wi*Math.log2(p_wi)
88 end 88 end
89 89
90 en *= -1 90 en *= -1
91 en 91 en
92 end 92 end
93 93
94 # Computes the term frequency of a given *word* +s+. 94 # Computes the term frequency of a given *word* +s+.
95 # 95 #
96 # tf("guitar") #=> 0.000380372765310004 96 # tf("guitar") #=> 0.000380372765310004
97 def tf(s) 97 def tf(s)
98 self.count_words[s].to_f/@words.size.to_f 98 self.count_words[s].to_f/@words.size.to_f
99 end 99 end
100 100
101 101
102 def initialize(content) 102 def initialize(content="")
103 @doc_content = content 103 @doc_content = content
104 @words = format_words 104 @words = format_words
105 end 105 end
106 106
107 protected :format_words 107 protected :format_words
108 end 108 end
109 109
110 # A WebDocument is a Document with a +url+. 110 # A WebDocument is a Document with a +url+.
111 class WebDocument < Document 111 class WebDocument < Document
112 attr_reader :url 112 attr_reader :url
113 113
114 # Returns the HTML text from the page of a given +url+. 114 # Returns the HTML text from the page of a given +url+.
115 def self.get_content(url) 115 def self.get_content(url)
116 require 'net/http' 116 require 'net/http'
117 Net::HTTP.get(URI.parse(url)) 117 Net::HTTP.get(URI.parse(url))
118 end 118 end
119 119
120 # WebDocument constructor, the content of the Document is the HTML page 120 # WebDocument constructor, the content of the Document is the HTML page
121 # without the tags. 121 # without the tags.
122 def initialize(url) 122 def initialize(url)
123 @url = url 123 @url = url
124 super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags 124 super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags
125 end 125 end
126 end 126 end
127 127
128 # A WikipediaPage is a WebDocument. 128 # A WikipediaPage is a WebDocument.
129 class WikipediaPage < WebDocument 129 class WikipediaPage < WebDocument
130 require 'rexml/document' 130 require 'rexml/document'
131 require 'net/http' 131 require 'net/http'
132 require 'kconv' 132 require 'kconv'
133 133
134 134
135 def self.search_wikipedia_titles(name) 135 def self.search_wikipedia_titles(name)
136 raise ArgumentError, "Bad encoding", name unless name.isutf8 136 raise ArgumentError, "Bad encoding", name unless name.isutf8
137 137
138 res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).toutf8).elements['api/query/search'] 138 res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).toutf8).elements['api/query/search']
139 139
140 res.collect { |e| e.attributes['title'] } unless res.nil? 140 res.collect { |e| e.attributes['title'] } unless res.nil?
141 end 141 end
142 142
143 def self.get_url(name) 143 def self.get_url(name)
144 raise ArgumentError, "Bad encoding", name unless name.isutf8 144 raise ArgumentError, "Bad encoding", name unless name.isutf8
145 145
146 atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).toutf8).elements['api/query/pages/page'].attributes 146 atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).toutf8).elements['api/query/pages/page'].attributes
147 147
148 atts['fullurl'] if atts['missing'].nil? 148 atts['fullurl'] if atts['missing'].nil?
149 end 149 end
150 150
151 def self.search_homepage(name) 151 def self.search_homepage(name)
152 title = WikipediaPage.search_wikipedia_titles name 152 title = WikipediaPage.search_wikipedia_titles name
153 153
154 WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty? 154 WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty?
155 end 155 end
156 156
157 # def initialize(name) 157 # def initialize(name)
158 # title = WikipediaPage.search_wikipedia_titles name 158 # title = WikipediaPage.search_wikipedia_titles name
159 # raise ArgumentError, "No page found" if title.empty? 159 # raise ArgumentError, "No page found" if title.empty?
160 # super WikipediaPage.get_url title[0] 160 # super WikipediaPage.get_url title[0]
161 # end 161 # end
162 end 162 end
163 end 163 end
164 164
lib/mirimiri/query.rb
1 #!/usr/bin/env ruby 1 #!/usr/bin/env ruby
2 2
3 #-- 3 #--
4 # This file is a part of the mirimiri library 4 # This file is a part of the mirimiri library
5 # 5 #
6 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> 6 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
7 # 7 #
8 # This program is free software: you can redistribute it and/or modify 8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by 9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or 10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version. 11 # (at your option) any later version.
12 # 12 #
13 # This program is distributed in the hope that it will be useful, 13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details. 16 # GNU General Public License for more details.
17 # 17 #
18 # You should have received a copy of the GNU General Public License 18 # You should have received a copy of the GNU General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>. 19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 #++ 20 #++
21 21
22 class Query 22 class Query
23 end 23 end
24 24
25 module Indri 25 module Indri
26 26
27 class Parameters 27 class Parameters
28 attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline 28 attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline
29 29
30 def initialize(corpus,mem="1g",count="1000",offset="1",run_id="default",print_query=false,print_docs=false) 30 def initialize(corpus,count="1000",mem="1g",offset="1",run_id="default",print_query=false,print_docs=false)
31 @index_path = corpus 31 @index_path = corpus
32 @memory = mem 32 @memory = mem
33 @count = count 33 @count = count
34 @offset = offset 34 @offset = offset
35 @run_id = run_id 35 @run_id = run_id
36 @print_query = print_query ? "true" : "false" 36 @print_query = print_query ? "true" : "false"
37 @print_docs = print_docs ? "true" : "false" 37 @print_docs = print_docs ? "true" : "false"
38 end 38 end
39 39
40 def to_s 40 def to_s
41 h = "<parameters>\n" 41 h = "<memory>#{@memory}</memory>\n"
42 h += "<memory>#{@memory}</memory>\n"
43 h += "<index>#{@index_path}</index>\n" 42 h += "<index>#{@index_path}</index>\n"
44 h += "<count>#{@count}</count>\n" 43 h += "<count>#{@count}</count>\n"
45 unless @baseline.nil? 44 unless @baseline.nil?
46 h += "<baseline>#{@baseline}</baseline>\n" 45 h += "<baseline>#{@baseline}</baseline>\n"
47 else 46 else
48 h += "<rule>#{@rule}</rule>\n" 47 h += "<rule>#{@rule}</rule>\n"
49 end 48 end
49 h += "<trecFormat>true</trecFormat>\n"
50 h += "<queryOffset>#{@offset}</queryOffset>\n" 50 h += "<queryOffset>#{@offset}</queryOffset>\n"
51 h += "<runID>#{@run_id}</runID>\n" 51 h += "<runID>#{@run_id}</runID>\n"
52 h += "<printQuery>#{@print_query}</printQuery>\n" 52 h += "<printQuery>#{@print_query}</printQuery>\n"
53 h += "<printDocuments>#{@print_docs}</printDocuments>\n" 53 h += "<printDocuments>#{@print_docs}</printDocuments>\n"
54 54
55 h 55 h
56 end 56 end
57 end 57 end
58 58
59 class IndriQuery < Query 59 class IndriQuery < Query
60 attr_accessor :id, :query, :params, :rule 60 attr_accessor :id, :query, :rule
61 61
62 def initialize(id,query,params) 62 def initialize(id,query)
63 @params = params
64 # Here we set the default retrieval model as Language Modeling
65 # with a Dirichlet smoothing at 2500.
66 # TODO: maybe a Rule class...
67 @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil?
68
69 @id = id 63 @id = id
70 @query = query 64 @query = query
71 end 65 end
72 66
73 def to_s 67 def to_s
74 h = @params.to_s 68 h = "<query>\n"
75 h += "<query>\n"
76 h += "<number>#{@id}</number>\n" 69 h += "<number>#{@id}</number>\n"
77 h += "<text>#{@query}</text>\n" 70 h += "<text>#{@query}</text>\n"
78 h += "</query>\n" 71 h += "</query>\n"
72
73 h
74 end
75 end
76
77 class IndriQueries
78 attr_accessor :params, :queries
79
80 def initialize(params,*queries)
81 @queries = queries
82
83 @params = params
84 # Here we set the default retrieval model as Language Modeling
85 # with a Dirichlet smoothing at 2500.
86 # TODO: maybe a Rule class...
87 @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil?
88 end
89
90 def to_s
91 h = "<parameters>\n"
92 h += @params.to_s
93 h += @queries.collect { |q| q.to_s }.join ""
lib/mirimiri/string.rb
1 #!/usr/bin/env ruby 1 #!/usr/bin/env ruby
2 2
3 #-- 3 #--
4 # This file is a part of the mirimiri library 4 # This file is a part of the mirimiri library
5 # 5 #
6 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> 6 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
7 # 7 #
8 # This program is free software: you can redistribute it and/or modify 8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by 9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or 10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version. 11 # (at your option) any later version.
12 # 12 #
13 # This program is distributed in the hope that it will be useful, 13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details. 16 # GNU General Public License for more details.
17 # 17 #
18 # You should have received a copy of the GNU General Public License 18 # You should have received a copy of the GNU General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>. 19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 #++ 20 #++
21 21
22 module Mirimiri 22 module Mirimiri
23 23
24 # These are the default stopwords provided by Lemur. 24 # These are the default stopwords provided by Lemur.
25 Stoplist = [ 25 Stoplist = [
26 "a", "anything", "anyway", "anywhere", "apart", "are", "around", "as", "at", "av", 26 "a", "anything", "anyway", "anywhere", "apart", "are", "around", "as", "at", "av",
27 "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", 27 "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand",
28 "behind", "being", "below", "beside", "besides", "between", "beyond", "both", "but", "by", 28 "behind", "being", "below", "beside", "besides", "between", "beyond", "both", "but", "by",
29 "can", "cannot", "canst", "certain", "cf", "choose", "contrariwise", "cos", "could", "cu", 29 "can", "cannot", "canst", "certain", "cf", "choose", "contrariwise", "cos", "could", "cu",
30 "day", "do", "does", "doesn't", "doing", "dost", "doth", "double", "down", "dual", "during", 30 "day", "do", "does", "doesn't", "doing", "dost", "doth", "double", "down", "dual", "during",
31 "each", "either", "else", "elsewhere", "enough", "et", "etc", "even", "ever", "every", 31 "each", "either", "else", "elsewhere", "enough", "et", "etc", "even", "ever", "every",
32 "everybody", "everyone", "everything", "everywhere", "except", "excepted", "excepting", 32 "everybody", "everyone", "everything", "everywhere", "except", "excepted", "excepting",
33 "exception", "exclude", "excluding", "exclusive", "far", "farther", "farthest", "few", "ff", 33 "exception", "exclude", "excluding", "exclusive", "far", "farther", "farthest", "few", "ff",
34 "first", "for", "formerly", "forth", "forward", "from", "front", "further", "furthermore", 34 "first", "for", "formerly", "forth", "forward", "from", "front", "further", "furthermore",
35 "furthest", "get", "go", "had", "halves", "hardly", "has", "hast", "hath", "have", "he", 35 "furthest", "get", "go", "had", "halves", "hardly", "has", "hast", "hath", "have", "he",
36 "hence", "henceforth", "her", "here", "hereabouts", "hereafter", "hereby", "herein", "hereto", 36 "hence", "henceforth", "her", "here", "hereabouts", "hereafter", "hereby", "herein", "hereto",
37 "hereupon", "hers", "herself", "him", "himself", "hindmost", "his", "hither", "hitherto", 37 "hereupon", "hers", "herself", "him", "himself", "hindmost", "his", "hither", "hitherto",
38 "how", "however", "howsoever", "i", "ie", "if", "in", "inasmuch", "inc", "include", 38 "how", "however", "howsoever", "i", "ie", "if", "in", "inasmuch", "inc", "include",
39 "included", "including", "indeed", "indoors", "inside", "insomuch", "instead", "into", 39 "included", "including", "indeed", "indoors", "inside", "insomuch", "instead", "into",
40 "inward", "inwards", "is", "it", "its", "itself", "just", "kind", "kg", "km", "last", 40 "inward", "inwards", "is", "it", "its", "itself", "just", "kind", "kg", "km", "last",
41 "latter", "latterly", "less", "lest", "let", "like", "little", "ltd", "many", "may", "maybe", 41 "latter", "latterly", "less", "lest", "let", "like", "little", "ltd", "many", "may", "maybe",
42 "me", "meantime", "meanwhile", "might", "moreover", "most", "mostly", "more", "mr", "mrs", 42 "me", "meantime", "meanwhile", "might", "moreover", "most", "mostly", "more", "mr", "mrs",
43 "ms", "much", "must", "my", "myself", "namely", "need", "neither", "never", "nevertheless", 43 "ms", "much", "must", "my", "myself", "namely", "need", "neither", "never", "nevertheless",
44 "next", "no", "nobody", "none", "nonetheless", "noone", "nope", "nor", "not", "nothing", 44 "next", "no", "nobody", "none", "nonetheless", "noone", "nope", "nor", "not", "nothing",
45 "notwithstanding", "now", "nowadays", "nowhere", "of", "off", "often", "ok", "on", "once", 45 "notwithstanding", "now", "nowadays", "nowhere", "of", "off", "often", "ok", "on", "once",
46 "one", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", 46 "one", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours",
47 "ourselves", "out", "outside", "over", "own", "per", "perhaps", "plenty", "provide", "quite", 47 "ourselves", "out", "outside", "over", "own", "per", "perhaps", "plenty", "provide", "quite",
48 "rather", "really", "round", "said", "sake", "same", "sang", "save", "saw", "see", "seeing", 48 "rather", "really", "round", "said", "sake", "same", "sang", "save", "saw", "see", "seeing",
49 "seem", "seemed", "seeming", "seems", "seen", "seldom", "selves", "sent", "several", "shalt", 49 "seem", "seemed", "seeming", "seems", "seen", "seldom", "selves", "sent", "several", "shalt",
50 "she", "should", "shown", "sideways", "since", "slept", "slew", "slung", "slunk", "smote", 50 "she", "should", "shown", "sideways", "since", "slept", "slew", "slung", "slunk", "smote",
51 "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", 51 "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes",
52 "somewhat", "somewhere", "spake", "spat", "spoke", "spoken", "sprang", "sprung", "stave", 52 "somewhat", "somewhere", "spake", "spat", "spoke", "spoken", "sprang", "sprung", "stave",
53 "staves", "still", "such", "supposing", "than", "that", "the", "thee", "their", "them", 53 "staves", "still", "such", "supposing", "than", "that", "the", "thee", "their", "them",
54 "themselves", "then", "thence", "thenceforth", "there", "thereabout", "thereabouts", 54 "themselves", "then", "thence", "thenceforth", "there", "thereabout", "thereabouts",
55 "thereafter", "thereby", "therefore", "therein", "thereof", "thereon", "thereto", "thereupon", 55 "thereafter", "thereby", "therefore", "therein", "thereof", "thereon", "thereto", "thereupon",
56 "these", "they", "this", "those", "thou", "though", "thrice", "through", "throughout", "thru", 56 "these", "they", "this", "those", "thou", "though", "thrice", "through", "throughout", "thru",
57 "thus", "thy", "thyself", "till", "to", "together", "too", "toward", "towards", "ugh", 57 "thus", "thy", "thyself", "till", "to", "together", "too", "toward", "towards", "ugh",
58 "unable", "under", "underneath", "unless", "unlike", "until", "up", "upon", "upward", 58 "unable", "under", "underneath", "unless", "unlike", "until", "up", "upon", "upward",
59 "upwards", "us", "use", "used", "using", "very", "via", "vs", "want", "was", "we", "week", 59 "upwards", "us", "use", "used", "using", "very", "via", "vs", "want", "was", "we", "week",
60 "well", "were", "what", "whatever", "whatsoever", "when", "whence", "whenever", "whensoever", 60 "well", "were", "what", "whatever", "whatsoever", "when", "whence", "whenever", "whensoever",
61 "where", "whereabouts", "whereafter", "whereas", "whereat", "whereby", "wherefore", 61 "where", "whereabouts", "whereafter", "whereas", "whereat", "whereby", "wherefore",
62 "wherefrom", "wherein", "whereinto", "whereof", "whereon", "wheresoever", "whereto", 62 "wherefrom", "wherein", "whereinto", "whereof", "whereon", "wheresoever", "whereto",
63 "whereunto", "whereupon", "wherever", "wherewith", "whether", "whew", "which", "whichever", 63 "whereunto", "whereupon", "wherever", "wherewith", "whether", "whew", "which", "whichever",
64 "whichsoever", "while", "whilst", "whither", "who", "whoa", "whoever", "whole", "whom", 64 "whichsoever", "while", "whilst", "whither", "who", "whoa", "whoever", "whole", "whom",
65 "whomever", "whomsoever", "whose", "whosoever", "why", "will", "wilt", "with", "within", 65 "whomever", "whomsoever", "whose", "whosoever", "why", "will", "wilt", "with", "within",
66 "without", "worse", "worst", "would", "wow", "ye", "yet", "year", "yippee", "you", "your", 66 "without", "worse", "worst", "would", "wow", "ye", "yet", "year", "yippee", "you", "your",
67 "yours", "yourself", "yourselves" 67 "yours", "yourself", "yourselves"
68 ] 68 ]
69 69
70 Transmap = {
71 "\xC3\x80" => "A", "\xC3\x81" => "A", "\xC3\x82" => "A", "\xC3\x83" => "A",
72 "\xC3\x84" => "A", "\xC3\x85" => "A", "\xC3\x86" => "AE","\xC3\x87" => "C",
73 "\xC3\x88" => "E", "\xC3\x89" => "E", "\xC3\x8A" => "E", "\xC3\x8B" => "E",
74 "\xC3\x8C" => "I", "\xC3\x8D" => "I", "\xC3\x8E" => "I", "\xC3\x8F" => "I",
75 "\xC3\x90" => "D", "\xC3\x91" => "N", "\xC3\x92" => "O", "\xC3\x93" => "O",
76 "\xC3\x94" => "O", "\xC3\x95" => "O", "\xC3\x96" => "O", "\xC3\x98" => "O",
77 "\xC3\x99" => "U", "\xC3\x9A" => "U", "\xC3\x9B" => "U", "\xC3\x9C" => "U",
78 "\xC3\x9D" => "Y", "\xC3\x9E" => "P", "\xC3\x9F" => "ss",
79 "\xC3\xA0" => "a", "\xC3\xA1" => "a", "\xC3\xA2" => "a", "\xC3\xA3" => "a",
80 "\xC3\xA4" => "a", "\xC3\xA5" => "a", "\xC3\xA6" => "ae","\xC3\xA7" => "c",
81 "\xC3\xA8" => "e", "\xC3\xA9" => "e", "\xC3\xAA" => "e", "\xC3\xAB" => "e",
82 "\xC3\xAC" => "i", "\xC3\xAD" => "i", "\xC3\xAE" => "i", "\xC3\xAF" => "i",
83 "\xC3\xB0" => "o", "\xC3\xB1" => "n", "\xC3\xB2" => "o", "\xC3\xB3" => "o",
84 "\xC3\xB4" => "o", "\xC3\xB5" => "o", "\xC3\xB6" => "o", "\xC3\xB8" => "o",
85 "\xC3\xB9" => "u", "\xC3\xBA" => "u", "\xC3\xBB" => "u", "\xC3\xBC" => "u",
86 "\xC3\xBD" => "y", "\xC3\xBE" => "p", "\xC3\xBF" => "y",
87 "\xC4\x80" => "A", "\xC4\x81" => "a", "\xC4\x82" => "A", "\xC4\x83" => "a",
88 "\xC4\x84" => "A", "\xC4\x85" => "a", "\xC4\x86" => "C", "\xC4\x87" => "c",
89 "\xC4\x88" => "C", "\xC4\x89" => "c", "\xC4\x8A" => "C", "\xC4\x8B" => "c",
90 "\xC4\x8C" => "C", "\xC4\x8D" => "c", "\xC4\x8E" => "D", "\xC4\x8F" => "d",
91 "\xC4\x90" => "D", "\xC4\x91" => "d", "\xC4\x92" => "E", "\xC4\x93" => "e",
92 "\xC4\x94" => "E", "\xC4\x95" => "e", "\xC4\x96" => "E", "\xC4\x97" => "e",
93 "\xC4\x98" => "E", "\xC4\x99" => "e", "\xC4\x9A" => "E", "\xC4\x9B" => "e",
94 "\xC4\x9C" => "G", "\xC4\x9D" => "g", "\xC4\x9E" => "G", "\xC4\x9F" => "g",
95 "\xC4\xA0" => "G", "\xC4\xA1" => "g", "\xC4\xA2" => "G", "\xC4\xA3" => "g",
96 "\xC4\xA4" => "H", "\xC4\xA5" => "h", "\xC4\xA6" => "H", "\xC4\xA7" => "h",
97 "\xC4\xA8" => "I", "\xC4\xA9" => "i", "\xC4\xAA" => "I", "\xC4\xAB" => "i",
98 "\xC4\xAC" => "I", "\xC4\xAD" => "i", "\xC4\xAE" => "I", "\xC4\xAF" => "i",
99 "\xC4\xB0" => "I", "\xC4\xB1" => "i", "\xC4\xB2" => "IJ","\xC4\xB3" => "ij",
100 "\xC4\xB4" => "J", "\xC4\xB5" => "j", "\xC4\xB6" => "K", "\xC4\xB7" => "k",
101 "\xC4\xB8" => "k", "\xC4\xB9" => "L", "\xC4\xBA" => "l", "\xC4\xBB" => "L",
102 "\xC4\xBC" => "l", "\xC4\xBD" => "L", "\xC4\xBE" => "l", "\xC4\xBF" => "L",
103 "\xC5\x80" => "l", "\xC5\x81" => "L", "\xC5\x82" => "l", "\xC5\x83" => "N",
104 "\xC5\x84" => "n", "\xC5\x85" => "N", "\xC5\x86" => "n", "\xC5\x87" => "N",
105 "\xC5\x88" => "n", "\xC5\x89" => "n", "\xC5\x8A" => "N", "\xC5\x8B" => "n",
106 "\xC5\x8C" => "O", "\xC5\x8D" => "o", "\xC5\x8E" => "O", "\xC5\x8F" => "o",
107 "\xC5\x90" => "O", "\xC5\x91" => "o", "\xC5\x92" => "CE","\xC5\x93" => "ce",
108 "\xC5\x94" => "R", "\xC5\x95" => "r", "\xC5\x96" => "R", "\xC5\x97" => "r",
109 "\xC5\x98" => "R", "\xC5\x99" => "r", "\xC5\x9A" => "S", "\xC5\x9B" => "s",
110 "\xC5\x9C" => "S", "\xC5\x9D" => "s", "\xC5\x9E" => "S", "\xC5\x9F" => "s",
111 "\xC5\xA0" => "S", "\xC5\xA1" => "s", "\xC5\xA2" => "T", "\xC5\xA3" => "t",
112 "\xC5\xA4" => "T", "\xC5\xA5" => "t", "\xC5\xA6" => "T", "\xC5\xA7" => "t",
113 "\xC5\xA8" => "U", "\xC5\xA9" => "u", "\xC5\xAA" => "U", "\xC5\xAB" => "u",
114 "\xC5\xAC" => "U", "\xC5\xAD" => "u", "\xC5\xAE" => "U", "\xC5\xAF" => "u",
115 "\xC5\xB0" => "U", "\xC5\xB1" => "u", "\xC5\xB2" => "U", "\xC5\xB3" => "u",
116 "\xC5\xB4" => "W", "\xC5\xB5" => "w", "\xC5\xB6" => "Y", "\xC5\xB7" => "y",
117 "\xC5\xB8" => "Y", "\xC5\xB9" => "Z", "\xC5\xBA" => "z", "\xC5\xBB" => "Z",
118 "\xC5\xBC" => "z", "\xC5\xBD" => "Z", "\xC5\xBE" => "z", "\xC6\x8F" => "E",
119 "\xC6\xA0" => "O", "\xC6\xA1" => "o", "\xC6\xAF" => "U", "\xC6\xB0" => "u",
120 "\xC7\x8D" => "A", "\xC7\x8E" => "a", "\xC7\x8F" => "I",
121 "\xC7\x90" => "i", "\xC7\x91" => "O", "\xC7\x92" => "o", "\xC7\x93" => "U",
122 "\xC7\x94" => "u", "\xC7\x95" => "U", "\xC7\x96" => "u", "\xC7\x97" => "U",
123 "\xC7\x98" => "u", "\xC7\x99" => "U", "\xC7\x9A" => "u", "\xC7\x9B" => "U",
124 "\xC7\x9C" => "u",
125 "\xC7\xBA" => "A", "\xC7\xBB" => "a", "\xC7\xBC" => "AE","\xC7\xBD" => "ae",
126 "\xC7\xBE" => "O", "\xC7\xBF" => "o",
127 "\xC9\x99" => "e",
128 "\xC2\x82" => ",", # High code comma
129 "\xC2\x84" => ",,", # High code double comma
130 "\xC2\x85" => "...", # Triple dot
131 "\xC2\x88" => "^", # High caret
132 "\xC2\x91" => "\x27", # Forward single quote
133 "\xC2\x92" => "\x27", # Reverse single quote
134 "\xC2\x93" => "\x22", # Forward double quote
135 "\xC2\x94" => "\x22", # Reverse double quote
136 "\xC2\x96" => "-", # High hyphen
137 "\xC2\x97" => "--", # Double hyphen
138 "\xC2\xA6" => "|", # Split vertical bar
139 "\xC2\xAB" => "<<", # Double less than
140 "\xC2\xBB" => ">>", # Double greater than
141 "\xC2\xBC" => "1/4", # one quarter
142 "\xC2\xBD" => "1/2", # one half
143 "\xC2\xBE" => "3/4", # three quarters
144 "\xCA\xBF" => "\x27", # c-single quote
145 "\xCC\xA8" => "", # modifier - under curve
146 "\xCC\xB1" => "", # modifier - under line
147 /\W/ => ""
148 }
70 149
71 end 150 end
72 151
73 # Extension of the standard class String with useful functions. 152 # Extension of the standard class String with useful functions.
74 class String 153 class String
75 include Mirimiri 154 include Mirimiri
155
156 def unaccent
157 # force_encoding is needed with ruby1.9
158 Transmap.inject(self.force_encoding("ASCII-8BIT")) { |str, (utf8, asc)| str.gsub(utf8, asc) }
159 end
76 160
77 # Returns +true+ if +self+ belongs to Mirimiri::Stoplist, +false+ otherwise. 161 # Returns +true+ if +self+ belongs to Mirimiri::Stoplist, +false+ otherwise.
78 def is_stopword? 162 def is_stopword?
79 Stoplist.include?(self.downcase) 163 Stoplist.include?(self.downcase)
80 end 164 end
81 165
82 # Do not use. 166 # Do not use.
83 # TODO: revamp. Find out why this function is here. 167 # TODO: revamp. Find out why this function is here.
84 def remove_special_characters 168 def remove_special_characters
85 self.split.collect { |w| w.gsub(/\W/,' ').split.collect { |w| w.gsub(/\W/,' ').strip.sub(/\A.\z/, '')}.join(' ').strip.sub(/\A.\z/, '')}.join(' ') 169 self.split.collect { |w| w.gsub(/\W/,' ').split.collect { |w| w.gsub(/\W/,' ').strip.sub(/\A.\z/, '')}.join(' ').strip.sub(/\A.\z/, '')}.join(' ')
86 end 170 end
87 171
88 # Removes all XML-like tags from +self+. 172 # Removes all XML-like tags from +self+.
89 # 173 #
90 # s = "<html><body>test</body></html>" 174 # s = "<html><body>test</body></html>"
91 # s.strip_xml_tags! 175 # s.strip_xml_tags!
92 # s #=> "test" 176 # s #=> "test"
93 def strip_xml_tags! 177 def strip_xml_tags!
94 replace strip_with_pattern /<\/?[^>]*>/ 178 replace strip_with_pattern /<\/?[^>]*>/
95 end 179 end
96 180
# Non-destructive version of strip_xml_tags!: works on a copy of +self+ and
# leaves the receiver untouched.
#
#   s = "<html><body>test</body></html>"
#   s.strip_xml_tags #=> "test"
#   s                #=> "<html><body>test</body></html>"
def strip_xml_tags
  copy = dup
  copy.strip_xml_tags!
end
105 189
# Removes all Javascript sources from +self+, in place.
#
# Every <script>...</script> element is removed, whatever its attributes,
# quoting style or letter case, including empty and multi-line elements.
# The removal is delegated to strip_with_pattern and the result replaces the
# content of the receiver. Returns +self+.
#
#   s = "<script type='text/javascript'>var skin='vector';</script>test"
#   s.strip_javascripts!
#   s #=> "test"
def strip_javascripts!
  # /m lets the script body span several lines, /i accepts <SCRIPT> as well.
  replace(strip_with_pattern(/<script\b[^>]*>.*?<\/script\s*>/mi))
end
119 203
# Non-destructive version of strip_javascripts!: works on a copy of +self+
# and leaves the receiver untouched.
#
#   s = "<script type=\"text/javascript\">var skin='vector';</script>test"
#   s.strip_javascripts #=> "test"
def strip_javascripts
  copy = dup
  copy.strip_javascripts!
end
132 216
# Removes all embedded stylesheets from +self+, in place.
#
# Every <style>...</style> element is removed, whatever its attributes,
# quoting style or letter case, including empty and multi-line elements.
# The removal is delegated to strip_with_pattern and the result replaces the
# content of the receiver. Returns +self+.
#
#   s = "<style type='text/css'>p { color: red; }</style>test"
#   s.strip_stylesheets!
#   s #=> "test"
def strip_stylesheets!
  # /m lets the stylesheet span several lines, /i accepts <STYLE> as well.
  replace(strip_with_pattern(/<style\b[^>]*>.*?<\/style\s*>/mi))
end
137 221
# Non-destructive version of strip_stylesheets!: works on a copy of +self+
# and leaves the receiver untouched.
def strip_stylesheets
  copy = dup
  copy.strip_stylesheets!
end
141 225
# Removes punctuation from +self+, in place.
#
# Every character that is not an ASCII letter, a digit, a hyphen or a
# whitespace is removed. The removal is delegated to strip_with_pattern and
# the result replaces the content of the receiver. Returns +self+.
#
#   s = "hello, world. how are you?!"
#   s.strip_punctuation!
#   s # => "hello world how are you"
def strip_punctuation!
  # Explicit parentheses: a bare regexp literal after a method name is parsed
  # with an "ambiguous first argument" warning.
  replace(strip_with_pattern(/[^a-zA-Z0-9\-\s]/))
end
150 234
# Non-destructive version of strip_punctuation!: works on a copy of +self+
# and leaves the receiver untouched.
#
#   s = "hello, world. how are you?!"
#   s.strip_punctuation # => "hello world how are you"
def strip_punctuation
  copy = dup
  copy.strip_punctuation!
end
158 242
# Returns the text values inside all occurrences of a XML tag in +self+.
#
# +tag_name+ is matched literally (regexp metacharacters are escaped) and as
# a whole name, so asking for 'a' does not pick up <abbr> elements.
# Elements with an empty content are not reported.
#
#   s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre"
#   s.extract_xmltags_values 'a' #=> ["Indianapolis", "Indiana"]
def extract_xmltags_values(tag_name)
  name = Regexp.escape(tag_name.to_s)
  # The lookahead requires the tag name to end right there (whitespace, "/"
  # or ">" follows); [^>]* then skips the attributes of the opening tag.
  scan(/<#{name}(?=[\s\/>])[^>]*>(.+?)<\/#{name}>/).flatten
end
166 250
# Returns a copy of +self+ from which every match of +pattern+ has been
# removed. HTML entities left in the text are then decoded with
# CGI.unescapeHTML and the result is converted with kconv's toutf8.
#
# Internal helper of the strip_* methods.
def strip_with_pattern(pattern)
  # Loaded on first use, as in the rest of this helper's history.
  require 'cgi'
  require 'kconv'

  without_matches = gsub(pattern, "")
  decoded = CGI.unescapeHTML(without_matches)
  decoded.toutf8
end

private :strip_with_pattern
174 end 258 end
175 259