Deveaud Romain / mirimiri

Browse Code »

Commit 845768f8ac5a1593db356377fcc68208c12efa74

Authored by Romain Deveaud 2010-12-22 18:50:40 +0100

1 parent 175908fe2a

Exists in master

creating a group of indri queries is possible. added an accent stripping function.

Showing 4 changed files with 116 additions and 17 deletions Inline Diff

examples/entropy.rb
lib/mirimiri/document.rb
lib/mirimiri/query.rb
lib/mirimiri/string.rb

examples/entropy.rb

Diff comments View file @ 845768f

1	require 'rir'	1	require 'mirimiri'
2		2
3	# Concatenates all lines from one file, without \n	3	# Concatenates all lines from one file, without \n
4	readme = File.open('README.markdown').readlines.collect { \|l\| l.chomp }.join(" ")	4	readme = File.open('README.markdown').readlines.collect { \|l\| l.chomp }.join(" ")
5		5
6	# Creates the document with a string	6	# Creates the document with a string
7	doc = RIR::Document.new readme	7	doc = Mirimiri::Document.new readme
8		8
9	# Outputs all the unique words of the document with their entropy scores	9	# Outputs all the unique words of the document with their entropy scores
10	p doc.words.collect { \|w\| "#{w} => #{doc.entropy w}" }	10	p doc.words.collect { \|w\| "#{w} => #{doc.entropy w}" }
11		11

lib/mirimiri/document.rb

Diff comments View file @ 845768f

1	#!/usr/bin/env ruby	1	#!/usr/bin/env ruby
2		2
3	#--	3	#--
4	# This file is a part of the mirimiri library	4	# This file is a part of the mirimiri library
5	#	5	#
6	# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>	6	# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
7	#	7	#
8	# This program is free software: you can redistribute it and/or modify	8	# This program is free software: you can redistribute it and/or modify
9	# it under the terms of the GNU General Public License as published by	9	# it under the terms of the GNU General Public License as published by
10	# the Free Software Foundation, either version 3 of the License, or	10	# the Free Software Foundation, either version 3 of the License, or
11	# (at your option) any later version.	11	# (at your option) any later version.
12	#	12	#
13	# This program is distributed in the hope that it will be useful,	13	# This program is distributed in the hope that it will be useful,
14	# but WITHOUT ANY WARRANTY; without even the implied warranty of	14	# but WITHOUT ANY WARRANTY; without even the implied warranty of
15	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	15	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16	# GNU General Public License for more details.	16	# GNU General Public License for more details.
17	#	17	#
18	# You should have received a copy of the GNU General Public License	18	# You should have received a copy of the GNU General Public License
19	# along with this program. If not, see <http://www.gnu.org/licenses/>.	19	# along with this program. If not, see <http://www.gnu.org/licenses/>.
20	#++	20	#++
21		21
22		22
23	# General module	23	# General module
24	module Mirimiri	24	module Mirimiri
25		25
26	# A Document is a bag of words and is constructed from a string.	26	# A Document is a bag of words and is constructed from a string.
27	class Document	27	class Document
28	attr_reader :words, :doc_content	28	attr_reader :words, :doc_content
29		29
30	# Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html	30	# Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html
31	# and the \\W special escape).	31	# and the \\W special escape).
32	#	32	#
33	# Protected function, only meant to be called at the initialization.	33	# Protected function, only meant to be called at the initialization.
34	def format_words	34	def format_words
35	wo = []	35	wo = []
36		36
37	@doc_content.split.each do \|w\|	37	@doc_content.split.each do \|w\|
38	w.split(/\W/).each do \|sw\|	38	w.split(/\W/).each do \|sw\|
39	wo.push(sw.downcase) if sw =~ /[a-zA-Z]/	39	wo.push(sw.downcase) if sw =~ /[a-zA-Z]/
40	end	40	end
41	end	41	end
42		42
43	wo	43	wo
44	end	44	end
45		45
46	# Returns an Array containing the +n+-grams (words) from the current Document.	46	# Returns an Array containing the +n+-grams (words) from the current Document.
47	#	47	#
48	# ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]	48	# ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]
49	def ngrams(n)	49	def ngrams(n)
50	window = []	50	window = []
51	ngrams_array = []	51	ngrams_array = []
52		52
53	@words.each do \|w\|	53	@words.each do \|w\|
54	window.push(w)	54	window.push(w)
55	if window.size == n	55	if window.size == n
56	ngrams_array.push window.join(" ")	56	ngrams_array.push window.join(" ")
57	window.delete_at(0)	57	window.delete_at(0)
58	end	58	end
59	end	59	end
60		60
61	ngrams_array.uniq	61	ngrams_array.uniq
62	end	62	end
63		63
64	# Returns a Hash containing the words and their associated counts in the current Document.	64	# Returns a Hash containing the words and their associated counts in the current Document.
65	#	65	#
66	# count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }	66	# count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }
67	def count_words	67	def count_words
68	counts = Hash.new { \|h,k\| h[k] = 0 }	68	counts = Hash.new { \|h,k\| h[k] = 0 }
69	@words.each { \|w\| counts[w] += 1 }	69	@words.each { \|w\| counts[w] += 1 }
70		70
71	counts	71	counts
72	end	72	end
73		73
74	# Computes the entropy of a given string +s+ inside the document.	74	# Computes the entropy of a given string +s+ inside the document.
75	#	75	#
76	# If the string parameter is composed of many words (i.e. tokens separated	76	# If the string parameter is composed of many words (i.e. tokens separated
77	# by whitespace(s)), it is considered as an ngram.	77	# by whitespace(s)), it is considered as an ngram.
78	#	78	#
79	# entropy("guitar") #=> 0.00432114812727959	79	# entropy("guitar") #=> 0.00432114812727959
80	# entropy("dillinger escape plan") #=> 0.265862076325102	80	# entropy("dillinger escape plan") #=> 0.265862076325102
81	def entropy(s)	81	def entropy(s)
82	en = 0.0	82	en = 0.0
83	counts = self.count_words	83	counts = self.count_words
84		84
85	s.split.each do \|w\|	85	s.split.each do \|w\|
86	p_wi = counts[w].to_f/@words.count.to_f	86	p_wi = counts[w].to_f/@words.count.to_f
87	en += p_wi*Math.log2(p_wi)	87	en += p_wi*Math.log2(p_wi)
88	end	88	end
89		89
90	en *= -1	90	en *= -1
91	en	91	en
92	end	92	end
93		93
94	# Computes the term frequency of a given word +s+.	94	# Computes the term frequency of a given word +s+.
95	#	95	#
96	# tf("guitar") #=> 0.000380372765310004	96	# tf("guitar") #=> 0.000380372765310004
97	def tf(s)	97	def tf(s)
98	self.count_words[s].to_f/@words.size.to_f	98	self.count_words[s].to_f/@words.size.to_f
99	end	99	end
100		100
101		101
102	def initialize(content)	102	def initialize(content="")
103	@doc_content = content	103	@doc_content = content
104	@words = format_words	104	@words = format_words
105	end	105	end
106		106
107	protected :format_words	107	protected :format_words
108	end	108	end
109		109
110	# A WebDocument is a Document with a +url+.	110	# A WebDocument is a Document with a +url+.
111	class WebDocument < Document	111	class WebDocument < Document
112	attr_reader :url	112	attr_reader :url
113		113
114	# Returns the HTML text from the page of a given +url+.	114	# Returns the HTML text from the page of a given +url+.
115	def self.get_content(url)	115	def self.get_content(url)
116	require 'net/http'	116	require 'net/http'
117	Net::HTTP.get(URI.parse(url))	117	Net::HTTP.get(URI.parse(url))
118	end	118	end
119		119
120	# WebDocument constructor, the content of the Document is the HTML page	120	# WebDocument constructor, the content of the Document is the HTML page
121	# without the tags.	121	# without the tags.
122	def initialize(url)	122	def initialize(url)
123	@url = url	123	@url = url
124	super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags	124	super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags
125	end	125	end
126	end	126	end
127		127
128	# A WikipediaPage is a WebDocument.	128	# A WikipediaPage is a WebDocument.
129	class WikipediaPage < WebDocument	129	class WikipediaPage < WebDocument
130	require 'rexml/document'	130	require 'rexml/document'
131	require 'net/http'	131	require 'net/http'
132	require 'kconv'	132	require 'kconv'
133		133
134		134
135	def self.search_wikipedia_titles(name)	135	def self.search_wikipedia_titles(name)
136	raise ArgumentError, "Bad encoding", name unless name.isutf8	136	raise ArgumentError, "Bad encoding", name unless name.isutf8
137		137
138	res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).toutf8).elements['api/query/search']	138	res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).toutf8).elements['api/query/search']
139		139
140	res.collect { \|e\| e.attributes['title'] } unless res.nil?	140	res.collect { \|e\| e.attributes['title'] } unless res.nil?
141	end	141	end
142		142
143	def self.get_url(name)	143	def self.get_url(name)
144	raise ArgumentError, "Bad encoding", name unless name.isutf8	144	raise ArgumentError, "Bad encoding", name unless name.isutf8
145		145
146	atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).toutf8).elements['api/query/pages/page'].attributes	146	atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).toutf8).elements['api/query/pages/page'].attributes
147		147
148	atts['fullurl'] if atts['missing'].nil?	148	atts['fullurl'] if atts['missing'].nil?
149	end	149	end
150		150
151	def self.search_homepage(name)	151	def self.search_homepage(name)
152	title = WikipediaPage.search_wikipedia_titles name	152	title = WikipediaPage.search_wikipedia_titles name
153		153
154	WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? \|\| title.empty?	154	WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? \|\| title.empty?
155	end	155	end
156		156
157	# def initialize(name)	157	# def initialize(name)
158	# title = WikipediaPage.search_wikipedia_titles name	158	# title = WikipediaPage.search_wikipedia_titles name
159	# raise ArgumentError, "No page found" if title.empty?	159	# raise ArgumentError, "No page found" if title.empty?
160	# super WikipediaPage.get_url title[0]	160	# super WikipediaPage.get_url title[0]
161	# end	161	# end
162	end	162	end
163	end	163	end
164		164

lib/mirimiri/query.rb

Diff comments View file @ 845768f

 #!/usr/bin/env ruby
 #--
 # This file is a part of the mirimiri library
 #
 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #++
 class Query
 end
 module Indri
   class Parameters
     attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline
-    def initialize(corpus,mem="1g",count="1000",offset="1",run_id="default",print_query=false,print_docs=false)
+    def initialize(corpus,count="1000",mem="1g",offset="1",run_id="default",print_query=false,print_docs=false)
       @index_path  = corpus
       @memory      = mem
       @count       = count
       @offset      = offset
       @run_id      = run_id
       @print_query = print_query ? "true" : "false"
       @print_docs  = print_docs  ? "true" : "false"
     end
     def to_s
-      h = "<parameters>\n"
+      h = "<memory>#{@memory}</memory>\n"
-      h += "<memory>#{@memory}</memory>\n"
       h += "<index>#{@index_path}</index>\n"
       h += "<count>#{@count}</count>\n"
       unless @baseline.nil?
         h += "<baseline>#{@baseline}</baseline>\n"
       else
         h += "<rule>#{@rule}</rule>\n"
       end
+      h += "<trecFormat>true</trecFormat>\n"
       h += "<queryOffset>#{@offset}</queryOffset>\n"
       h += "<runID>#{@run_id}</runID>\n"
       h += "<printQuery>#{@print_query}</printQuery>\n"
       h += "<printDocuments>#{@print_docs}</printDocuments>\n"
       h
     end
   end
   class IndriQuery < Query
-    attr_accessor :id, :query, :params, :rule
+    attr_accessor :id, :query, :rule
-    def initialize(id,query,params)
+    def initialize(id,query)
-      @params = params
-      # Here we set the default retrieval model as Language Modeling
-      # with a Dirichlet smoothing at 2500.
-      # TODO: maybe a Rule class...
-      @params.rule  = 'method:dirichlet,mu:2500' if @params.rule.nil?
       @id     = id
       @query  = query
     end
     def to_s
-      h = @params.to_s
+      h = "<query>\n"
-      h += "<query>\n"
       h += "<number>#{@id}</number>\n"
       h += "<text>#{@query}</text>\n"
       h += "</query>\n"
+      h
+    end
+  end
+  class IndriQueries
+    attr_accessor :params, :queries
+    def initialize(params,*queries)
+      @queries = queries
+      @params = params
+      # Here we set the default retrieval model as Language Modeling
+      # with a Dirichlet smoothing at 2500.
+      # TODO: maybe a Rule class...
+      @params.rule  = 'method:dirichlet,mu:2500' if @params.rule.nil?
+    end
+    def to_s
+      h = "<parameters>\n"
+      h += @params.to_s
+      h += @queries.collect { |q| q.to_s }.join ""

lib/mirimiri/string.rb

Diff comments View file @ 845768f

 #!/usr/bin/env ruby
 #--
 # This file is a part of the mirimiri library
 #
 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #++
 module Mirimiri
   # These are the default stopwords provided by Lemur.
   Stoplist = [
   "a", "anything", "anyway", "anywhere", "apart", "are", "around", "as", "at", "av",
   "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand",
   "behind", "being", "below", "beside", "besides", "between", "beyond", "both", "but", "by",
   "can", "cannot", "canst", "certain", "cf", "choose", "contrariwise", "cos", "could", "cu",
   "day", "do", "does", "doesn't", "doing", "dost", "doth", "double", "down", "dual", "during",
   "each", "either", "else", "elsewhere", "enough", "et", "etc", "even", "ever", "every",
   "everybody", "everyone", "everything", "everywhere", "except", "excepted", "excepting",
   "exception", "exclude", "excluding", "exclusive", "far", "farther", "farthest", "few", "ff",
   "first", "for", "formerly", "forth", "forward", "from", "front", "further", "furthermore",
   "furthest", "get", "go", "had", "halves", "hardly", "has", "hast", "hath", "have", "he",
   "hence", "henceforth", "her", "here", "hereabouts", "hereafter", "hereby", "herein", "hereto",
   "hereupon", "hers", "herself", "him", "himself", "hindmost", "his", "hither", "hitherto",
   "how", "however", "howsoever", "i", "ie", "if", "in", "inasmuch", "inc", "include",
   "included", "including", "indeed", "indoors", "inside", "insomuch", "instead", "into",
   "inward", "inwards", "is", "it", "its", "itself", "just", "kind", "kg", "km", "last",
   "latter", "latterly", "less", "lest", "let", "like", "little", "ltd", "many", "may", "maybe",
   "me", "meantime", "meanwhile", "might", "moreover", "most", "mostly", "more", "mr", "mrs",
   "ms", "much", "must", "my", "myself", "namely", "need", "neither", "never", "nevertheless",
   "next", "no", "nobody", "none", "nonetheless", "noone", "nope", "nor", "not", "nothing",
   "notwithstanding", "now", "nowadays", "nowhere", "of", "off", "often", "ok", "on", "once",
   "one", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours",
   "ourselves", "out", "outside", "over", "own", "per", "perhaps", "plenty", "provide", "quite",
   "rather", "really", "round", "said", "sake", "same", "sang", "save", "saw", "see", "seeing",
   "seem", "seemed", "seeming", "seems", "seen", "seldom", "selves", "sent", "several", "shalt",
   "she", "should", "shown", "sideways", "since", "slept", "slew", "slung", "slunk", "smote",
   "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes",
   "somewhat", "somewhere", "spake", "spat", "spoke", "spoken", "sprang", "sprung", "stave",
   "staves", "still", "such", "supposing", "than", "that", "the", "thee", "their", "them",
   "themselves", "then", "thence", "thenceforth", "there", "thereabout", "thereabouts",
   "thereafter", "thereby", "therefore", "therein", "thereof", "thereon", "thereto", "thereupon",
   "these", "they", "this", "those", "thou", "though", "thrice", "through", "throughout", "thru",
   "thus", "thy", "thyself", "till", "to", "together", "too", "toward", "towards", "ugh",
   "unable", "under", "underneath", "unless", "unlike", "until", "up", "upon", "upward",
   "upwards", "us", "use", "used", "using", "very", "via", "vs", "want", "was", "we", "week",
   "well", "were", "what", "whatever", "whatsoever", "when", "whence", "whenever", "whensoever",
   "where", "whereabouts", "whereafter", "whereas", "whereat", "whereby", "wherefore",
   "wherefrom", "wherein", "whereinto", "whereof", "whereon", "wheresoever", "whereto",
   "whereunto", "whereupon", "wherever", "wherewith", "whether", "whew", "which", "whichever",
   "whichsoever", "while", "whilst", "whither", "who", "whoa", "whoever", "whole", "whom",
   "whomever", "whomsoever", "whose", "whosoever", "why", "will", "wilt", "with", "within",
   "without", "worse", "worst", "would", "wow", "ye", "yet", "year", "yippee", "you", "your",
   "yours", "yourself", "yourselves"
   ]
+  Transmap = {
+  "\xC3\x80" => "A", "\xC3\x81" => "A", "\xC3\x82" => "A", "\xC3\x83" => "A",
+  "\xC3\x84" => "A", "\xC3\x85" => "A", "\xC3\x86" => "AE","\xC3\x87" => "C",
+  "\xC3\x88" => "E", "\xC3\x89" => "E", "\xC3\x8A" => "E", "\xC3\x8B" => "E",
+  "\xC3\x8C" => "I", "\xC3\x8D" => "I", "\xC3\x8E" => "I", "\xC3\x8F" => "I",
+  "\xC3\x90" => "D", "\xC3\x91" => "N", "\xC3\x92" => "O", "\xC3\x93" => "O",
+  "\xC3\x94" => "O", "\xC3\x95" => "O", "\xC3\x96" => "O", "\xC3\x98" => "O",
+  "\xC3\x99" => "U", "\xC3\x9A" => "U", "\xC3\x9B" => "U", "\xC3\x9C" => "U",
+  "\xC3\x9D" => "Y", "\xC3\x9E" => "P", "\xC3\x9F" => "ss",
+  "\xC3\xA0" => "a", "\xC3\xA1" => "a", "\xC3\xA2" => "a", "\xC3\xA3" => "a",
+  "\xC3\xA4" => "a", "\xC3\xA5" => "a", "\xC3\xA6" => "ae","\xC3\xA7" => "c",
+  "\xC3\xA8" => "e", "\xC3\xA9" => "e", "\xC3\xAA" => "e", "\xC3\xAB" => "e",
+  "\xC3\xAC" => "i", "\xC3\xAD" => "i", "\xC3\xAE" => "i", "\xC3\xAF" => "i",
+  "\xC3\xB0" => "o", "\xC3\xB1" => "n", "\xC3\xB2" => "o", "\xC3\xB3" => "o",
+  "\xC3\xB4" => "o", "\xC3\xB5" => "o", "\xC3\xB6" => "o", "\xC3\xB8" => "o",
+  "\xC3\xB9" => "u", "\xC3\xBA" => "u", "\xC3\xBB" => "u", "\xC3\xBC" => "u",
+  "\xC3\xBD" => "y", "\xC3\xBE" => "p", "\xC3\xBF" => "y",
+  "\xC4\x80" => "A", "\xC4\x81" => "a", "\xC4\x82" => "A", "\xC4\x83" => "a",
+  "\xC4\x84" => "A", "\xC4\x85" => "a", "\xC4\x86" => "C", "\xC4\x87" => "c",
+  "\xC4\x88" => "C", "\xC4\x89" => "c", "\xC4\x8A" => "C", "\xC4\x8B" => "c",
+  "\xC4\x8C" => "C", "\xC4\x8D" => "c", "\xC4\x8E" => "D", "\xC4\x8F" => "d",
+  "\xC4\x90" => "D", "\xC4\x91" => "d", "\xC4\x92" => "E", "\xC4\x93" => "e",
+  "\xC4\x94" => "E", "\xC4\x95" => "e", "\xC4\x96" => "E", "\xC4\x97" => "e",
+  "\xC4\x98" => "E", "\xC4\x99" => "e", "\xC4\x9A" => "E", "\xC4\x9B" => "e",
+  "\xC4\x9C" => "G", "\xC4\x9D" => "g", "\xC4\x9E" => "G", "\xC4\x9F" => "g",
+  "\xC4\xA0" => "G", "\xC4\xA1" => "g", "\xC4\xA2" => "G", "\xC4\xA3" => "g",
+  "\xC4\xA4" => "H", "\xC4\xA5" => "h", "\xC4\xA6" => "H", "\xC4\xA7" => "h",
+  "\xC4\xA8" => "I", "\xC4\xA9" => "i", "\xC4\xAA" => "I", "\xC4\xAB" => "i",
+  "\xC4\xAC" => "I", "\xC4\xAD" => "i", "\xC4\xAE" => "I", "\xC4\xAF" => "i",
+  "\xC4\xB0" => "I", "\xC4\xB1" => "i", "\xC4\xB2" => "IJ","\xC4\xB3" => "ij",
+  "\xC4\xB4" => "J", "\xC4\xB5" => "j", "\xC4\xB6" => "K", "\xC4\xB7" => "k",
+  "\xC4\xB8" => "k", "\xC4\xB9" => "L", "\xC4\xBA" => "l", "\xC4\xBB" => "L",
+  "\xC4\xBC" => "l", "\xC4\xBD" => "L", "\xC4\xBE" => "l", "\xC4\xBF" => "L",
+  "\xC5\x80" => "l", "\xC5\x81" => "L", "\xC5\x82" => "l", "\xC5\x83" => "N",
+  "\xC5\x84" => "n", "\xC5\x85" => "N", "\xC5\x86" => "n", "\xC5\x87" => "N",
+  "\xC5\x88" => "n", "\xC5\x89" => "n", "\xC5\x8A" => "N", "\xC5\x8B" => "n",
+  "\xC5\x8C" => "O", "\xC5\x8D" => "o", "\xC5\x8E" => "O", "\xC5\x8F" => "o",
+  "\xC5\x90" => "O", "\xC5\x91" => "o", "\xC5\x92" => "CE","\xC5\x93" => "ce",
+  "\xC5\x94" => "R", "\xC5\x95" => "r", "\xC5\x96" => "R", "\xC5\x97" => "r",
+  "\xC5\x98" => "R", "\xC5\x99" => "r", "\xC5\x9A" => "S", "\xC5\x9B" => "s",
+  "\xC5\x9C" => "S", "\xC5\x9D" => "s", "\xC5\x9E" => "S", "\xC5\x9F" => "s",
+  "\xC5\xA0" => "S", "\xC5\xA1" => "s", "\xC5\xA2" => "T", "\xC5\xA3" => "t",
+  "\xC5\xA4" => "T", "\xC5\xA5" => "t", "\xC5\xA6" => "T", "\xC5\xA7" => "t",
+  "\xC5\xA8" => "U", "\xC5\xA9" => "u", "\xC5\xAA" => "U", "\xC5\xAB" => "u",
+  "\xC5\xAC" => "U", "\xC5\xAD" => "u", "\xC5\xAE" => "U", "\xC5\xAF" => "u",
+  "\xC5\xB0" => "U", "\xC5\xB1" => "u", "\xC5\xB2" => "U", "\xC5\xB3" => "u",
+  "\xC5\xB4" => "W", "\xC5\xB5" => "w", "\xC5\xB6" => "Y", "\xC5\xB7" => "y",
+  "\xC5\xB8" => "Y", "\xC5\xB9" => "Z", "\xC5\xBA" => "z", "\xC5\xBB" => "Z",
+  "\xC5\xBC" => "z", "\xC5\xBD" => "Z", "\xC5\xBE" => "z", "\xC6\x8F" => "E",
+  "\xC6\xA0" => "O", "\xC6\xA1" => "o", "\xC6\xAF" => "U", "\xC6\xB0" => "u",
+  "\xC7\x8D" => "A", "\xC7\x8E" => "a", "\xC7\x8F" => "I",
+  "\xC7\x90" => "i", "\xC7\x91" => "O", "\xC7\x92" => "o", "\xC7\x93" => "U",
+  "\xC7\x94" => "u", "\xC7\x95" => "U", "\xC7\x96" => "u", "\xC7\x97" => "U",
+  "\xC7\x98" => "u", "\xC7\x99" => "U", "\xC7\x9A" => "u", "\xC7\x9B" => "U",
+  "\xC7\x9C" => "u",
+  "\xC7\xBA" => "A", "\xC7\xBB" => "a", "\xC7\xBC" => "AE","\xC7\xBD" => "ae",
+  "\xC7\xBE" => "O", "\xC7\xBF" => "o",
+  "\xC9\x99" => "e",
+  "\xC2\x82" => ",",        # High code comma
+  "\xC2\x84" => ",,",       # High code double comma
+  "\xC2\x85" => "...",      # Tripple dot
+  "\xC2\x88" => "^",        # High carat
+  "\xC2\x91" => "\x27",     # Forward single quote
+  "\xC2\x92" => "\x27",     # Reverse single quote
+  "\xC2\x93" => "\x22",     # Forward double quote
+  "\xC2\x94" => "\x22",     # Reverse double quote
+  "\xC2\x96" => "-",        # High hyphen
+  "\xC2\x97" => "--",       # Double hyphen
+  "\xC2\xA6" => "|",        # Split vertical bar
+  "\xC2\xAB" => "<<",       # Double less than
+  "\xC2\xBB" => ">>",       # Double greater than
+  "\xC2\xBC" => "1/4",      # one quarter
+  "\xC2\xBD" => "1/2",      # one half
+  "\xC2\xBE" => "3/4",      # three quarters
+  "\xCA\xBF" => "\x27",     # c-single quote
+  "\xCC\xA8" => "",         # modifier - under curve
+  "\xCC\xB1" => "",         # modifier - under line
+  /\W/ => ""
+  }
 end
 # Extension of the standard class String with useful functions.
 class String
   include Mirimiri
+  def unaccent
+    # force_encoding is needed with ruby1.9
+    Transmap.inject(self.force_encoding("ASCII-8BIT")) { |str, (utf8, asc)| str.gsub(utf8, asc) }
+  end
   # Tells whether +self+, compared case-insensitively, is one of the
   # entries of Mirimiri::Stoplist.
   #
   #   "The".is_stopword?     #=> true
   #   "guitar".is_stopword?  #=> false
   def is_stopword?
     lowered = downcase
     Stoplist.include?(lowered)
   end
   # Do not use.
   # TODO: revamp. Find out why this function is here.
   #
   # Splits +self+ on whitespace, turns every non-word character (\W) of
   # each token into a space, blanks out the resulting pieces that are a
   # single character long, and joins the tokens back with single spaces.
   # A token that ends up empty still contributes its separator, so
   # consecutive spaces may appear in the result.
   #
   #   "hello, a world!".remove_special_characters  #=> "hello  world"
   def remove_special_characters
     cleaned_tokens = split.map do |token|
       pieces = token.gsub(/\W/, ' ').split.map do |piece|
         piece.gsub(/\W/, ' ').strip.sub(/\A.\z/, '')
       end
       pieces.join(' ').strip.sub(/\A.\z/, '')
     end

     cleaned_tokens.join(' ')
   end
   # Deletes every XML-like tag (opening or closing) from +self+, in place.
   # The replacement text is produced by strip_with_pattern.
   #
   #   s = "<html><body>test</body></html>"
   #   s.strip_xml_tags!
   #   s                                     #=> "test"
   def strip_xml_tags!
     tag_pattern = /<\/?[^>]*>/
     replace(strip_with_pattern(tag_pattern))
   end
   # Non-destructive counterpart of strip_xml_tags!: works on a copy of
   # +self+ and returns that copy with every XML-like tag removed.
   #
   #   s = "<html><body>test</body></html>"
   #   s.strip_xml_tags                      #=> "test"
   #   s                                     #=> "<html><body>test</body></html>"
   def strip_xml_tags
     copy = dup
     copy.strip_xml_tags!
   end
   # Removes every <script> element (opening tag, body and closing tag)
   # from +self+, in place.
   #
   # The opening tag may carry any attributes, with any quoting style or
   # none at all, and the tag name is matched case-insensitively. Matching
   # on the literal +type="text/javascript"+ only would leave bare
   # <script> blocks in the text, and requiring a non-empty body would let
   # an empty element swallow everything up to the next closing tag.
   #
   # The cleaned text is produced by strip_with_pattern.
   #
   #   s = "<script type='text/javascript'>var skin='vector';</script>test"
   #   s.strip_javascripts!
   #   s                                     #=> "test"
   def strip_javascripts!
     # [^>]* skips the attributes; .*? with /m spans multi-line bodies and
     # stops at the first closing tag.
     script_block = /<script\b[^>]*>.*?<\/script>/mi
     replace(strip_with_pattern(script_block))
   end
   # Non-destructive counterpart of strip_javascripts!: works on a copy of
   # +self+ and returns that copy with the Javascript sources removed.
   #
   #   s = '<script type="text/javascript">var skin="vector";</script>test'
   #   s.strip_javascripts                   #=> "test"
   #   s                                     # still holds the <script> block
   def strip_javascripts
     copy = dup
     copy.strip_javascripts!
   end
   # Removes every <style> element (opening tag, CSS body and closing tag)
   # from +self+, in place.
   #
   # The opening tag may carry any attributes (or none) and the tag name is
   # matched case-insensitively, so blocks such as <style> or
   # <style type="text/css" media="screen"> are stripped as well. An empty
   # element is removed on its own instead of being merged with the text
   # that follows it.
   #
   # The cleaned text is produced by strip_with_pattern.
   #
   #   s = "<style type='text/css'>body { color: red }</style>test"
   #   s.strip_stylesheets!
   #   s                                     #=> "test"
   def strip_stylesheets!
     # [^>]* skips the attributes; .*? with /m spans multi-line bodies and
     # stops at the first closing tag.
     style_block = /<style\b[^>]*>.*?<\/style>/mi
     replace(strip_with_pattern(style_block))
   end
   # Non-destructive counterpart of strip_stylesheets!: works on a copy of
   # +self+ and returns that copy, leaving +self+ untouched.
   def strip_stylesheets
     copy = dup
     copy.strip_stylesheets!
   end
   # Deletes, in place, every character of +self+ that is not an ASCII
   # letter, a digit, a hyphen or whitespace. The replacement text is
   # produced by strip_with_pattern.
   #
   #   s = "hello, world. how are you?!"
   #   s.strip_punctuation!
   #   s                                 # => "hello world how are you"
   def strip_punctuation!
     unwanted = /[^a-zA-Z0-9\-\s]/
     replace(strip_with_pattern(unwanted))
   end
   # Non-destructive counterpart of strip_punctuation!: works on a copy of
   # +self+ and returns that copy with the punctuation removed.
   #
   #   s = "hello, world. how are you?!"
   #   s.strip_punctuation               # => "hello world how are you"
   def strip_punctuation
     copy = dup
     copy.strip_punctuation!
   end
   # Returns the text values inside all occurrences of a XML tag in +self+.
   #
   # +tag_name+ is interpolated as-is into the regular expression. Only
   # elements whose name is exactly +tag_name+ are considered (asking for
   # 'b' does not match <body>), and elements with an empty body yield no
   # entry. Bodies spanning several lines are not matched.
   #
   #   s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre"
   #   s.extract_xmltags_values 'a' #=> ["Indianapolis", "Indiana"]
   def extract_xmltags_values(tag_name)
     # \b stops the tag name from matching a longer name with the same
     # prefix; [^>]* keeps the attribute part inside the opening tag.
     element = /<#{tag_name}\b[^>]*>(.*?)<\/#{tag_name}>/

     # (.*?) lets an empty element match on its own instead of running
     # into the next one; the resulting empty strings are then discarded.
     scan(element).flatten.reject { |value| value.empty? }
   end
   # Returns a copy of +self+ in which every match of +pattern+ has been
   # deleted, HTML entities have then been unescaped with CGI, and the
   # result has been converted with Kconv's +toutf8+.
   #
   # Private helper shared by the strip_* methods.
   def strip_with_pattern(pattern)
     require 'cgi'
     require 'kconv'

     without_matches = gsub(pattern, "")
     unescaped = CGI::unescapeHTML(without_matches)
     unescaped.toutf8
   end
   private :strip_with_pattern
 end