Deveaud Romain / mirimiri

Browse Code »

Commit ca96fb31f8d5fe261716907d190363d6492d29ff

Authored by romain 2011-04-01 11:17:17 +0200

1 parent b55f47b385

Exists in master

exec method for Indri

Showing 3 changed files with 52 additions and 46 deletions Inline Diff

lib/mirimiri/document.rb
lib/mirimiri/query.rb
lib/mirimiri/string.rb

lib/mirimiri/document.rb

Diff comments View file @ ca96fb3

1	#!/usr/bin/env ruby	1	#!/usr/bin/env ruby
2		2
3	#--	3	#--
4	# This file is a part of the mirimiri library	4	# This file is a part of the mirimiri library
5	#	5	#
6	# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>	6	# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
7	#	7	#
8	# This program is free software: you can redistribute it and/or modify	8	# This program is free software: you can redistribute it and/or modify
9	# it under the terms of the GNU General Public License as published by	9	# it under the terms of the GNU General Public License as published by
10	# the Free Software Foundation, either version 3 of the License, or	10	# the Free Software Foundation, either version 3 of the License, or
11	# (at your option) any later version.	11	# (at your option) any later version.
12	#	12	#
13	# This program is distributed in the hope that it will be useful,	13	# This program is distributed in the hope that it will be useful,
14	# but WITHOUT ANY WARRANTY; without even the implied warranty of	14	# but WITHOUT ANY WARRANTY; without even the implied warranty of
15	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	15	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16	# GNU General Public License for more details.	16	# GNU General Public License for more details.
17	#	17	#
18	# You should have received a copy of the GNU General Public License	18	# You should have received a copy of the GNU General Public License
19	# along with this program. If not, see <http://www.gnu.org/licenses/>.	19	# along with this program. If not, see <http://www.gnu.org/licenses/>.
20	#++	20	#++
21		21
22		22
23	# General module	23	# General module
24	module Mirimiri	24	module Mirimiri
25		25
26	# A Document is a bag of words and is constructed from a string.	26	# A Document is a bag of words and is constructed from a string.
27	class Document	27	class Document
28	attr_reader :words, :doc_content, :count_words	28	attr_reader :words, :doc_content, :count_words
29		29
30	# Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html	30	# Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html
31	# and the \\W special escape).	31	# and the \\W special escape).
32	#	32	#
33	# Protected function, only meant to be called at the initialization.	33	# Protected function, only meant to be called at the initialization.
34	def format_words	34	def format_words
35	wo = []	35	wo = []
36		36
37	@doc_content.split.each do \|w\|	37	@doc_content.split.each do \|w\|
38	w.split(/\W/).each do \|sw\|	38	w.split(/\W/).each do \|sw\|
39	wo.push(sw.downcase) if sw =~ /[a-zA-Z]/	39	wo.push(sw.downcase) if sw =~ /[a-zA-Z]/
40	end	40	end
41	end	41	end
42		42
43	wo	43	wo
44	end	44	end
45		45
46	# Returns an Array containing the +n+-grams (words) from the current Document.	46	# Returns an Array containing the +n+-grams (words) from the current Document.
47	#	47	#
48	# ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]	48	# ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]
49	def ngrams(n)	49	def ngrams(n)
50	window = []	50	window = []
51	ngrams_array = []	51	ngrams_array = []
52		52
53	@words.each do \|w\|	53	@words.each do \|w\|
54	window.push(w)	54	window.push(w)
55	if window.size == n	55	if window.size == n
56	ngrams_array.push window.join(" ")	56	ngrams_array.push window.join(" ")
57	window.delete_at(0)	57	window.delete_at(0)
58	end	58	end
59	end	59	end
60		60
61	ngrams_array.uniq	61	ngrams_array.uniq
62	end	62	end
63		63
64	# Returns a Hash containing the words and their associated counts in the current Document.	64	# Returns a Hash containing the words and their associated counts in the current Document.
65	#	65	#
66	# count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }	66	# count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }
67	def count_words	67	def count_words
68	counts = Hash.new { \|h,k\| h[k] = 0 }	68	counts = Hash.new { \|h,k\| h[k] = 0 }
69	@words.each { \|w\| counts[w] += 1 }	69	@words.each { \|w\| counts[w] += 1 }
70		70
71	counts	71	counts
72	end	72	end
73		73
74	# Computes the entropy of a given string +s+ inside the document.	74	# Computes the entropy of a given string +s+ inside the document.
75	#	75	#
76	# If the string parameter is composed of many words (i.e. tokens separated	76	# If the string parameter is composed of many words (i.e. tokens separated
77	# by whitespace(s)), it is considered as an ngram.	77	# by whitespace(s)), it is considered as an ngram.
78	#	78	#
79	# entropy("guitar") #=> 0.00432114812727959	79	# entropy("guitar") #=> 0.00432114812727959
80	# entropy("dillinger escape plan") #=> 0.265862076325102	80	# entropy("dillinger escape plan") #=> 0.265862076325102
81	def entropy(s)	81	def entropy(s)
82	en = 0.0	82	en = 0.0
83		83
84	s.split.each do \|w\|	84	s.split.each do \|w\|
85	p_wi = @count_words[w].to_f/@words.count.to_f	85	p_wi = @count_words[w].to_f/@words.count.to_f
86	en += p_wi*Math.log2(p_wi)	86	en += p_wi*Math.log2(p_wi)
87	end	87	end
88		88
89	en *= -1	89	en *= -1
90	en	90	en
91	end	91	end
92		92
93	# Computes the term frequency of a given word +s+.	93	# Computes the term frequency of a given word +s+.
94	#	94	#
95	# tf("guitar") #=> 0.000380372765310004	95	# tf("guitar") #=> 0.000380372765310004
96	def tf(s)	96	def tf(s)
97	@count_words[s].to_f/@words.size.to_f	97	@count_words[s].to_f/@words.size.to_f
98	end	98	end
99		99
100		100
101	def initialize(content="")	101	def initialize(content="")
102	@doc_content = content	102	@doc_content = content
103	@words = format_words	103	@words = format_words
104	@count_words = count_words	104	@count_words = count_words
105	end	105	end
106		106
107	protected :format_words, :count_words	107	protected :format_words, :count_words
108	end	108	end
109		109
110	# A WebDocument is a Document with a +url+.	110	# A WebDocument is a Document with a +url+.
111	class WebDocument < Document	111	class WebDocument < Document
112	attr_reader :url	112	attr_reader :url
113		113
114	# Returns the HTML text from the page of a given +url+.	114	# Returns the HTML text from the page of a given +url+.
115	def self.get_content(url)	115	def self.get_content(url)
116	require 'net/http'	116	require 'net/http'
117	Net::HTTP.get(URI.parse(url))	117	Net::HTTP.get(URI.parse(url))
118	end	118	end
119		119
120	# WebDocument constructor, the content of the Document is the HTML page	120	# WebDocument constructor, the content of the Document is the HTML page
121	# without the tags.	121	# without the tags.
122	def initialize(url,only_tags=nil)	122	def initialize(url,only_tags=nil)
123	@url = url	123	@url = url
124	content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("")	124	content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("")
125	super content.strip_javascripts.strip_xml_tags	125	super content.strip_javascripts.strip_xml_tags
126	end	126	end
127	end	127	end
128		128
129	# A WikipediaPage is a WebDocument.	129	# A WikipediaPage is a WebDocument.
130	class WikipediaPage < WebDocument	130	class WikipediaPage < WebDocument
131	require 'rexml/document'	131	require 'rexml/document'
132	require 'net/http'	132	require 'net/http'
133	require 'kconv'	133	require 'kconv'
134		134
135		135
136	def self.search_wikipedia_titles(name)	136	def self.search_wikipedia_titles(name)
137	raise ArgumentError, "Bad encoding", name unless name.isutf8	137	raise ArgumentError, "Bad encoding", name unless name.isutf8
138		138
139	res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).unaccent.toutf8).elements['api/query/search']	139	res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).unaccent.toutf8).elements['api/query/search']
140		140
141	res.collect { \|e\| e.attributes['title'] } unless res.nil?	141	res.collect { \|e\| e.attributes['title'] } unless res.nil?
142	end	142	end
143		143
144	def self.get_url(name)	144	def self.get_url(name)
145	raise ArgumentError, "Bad encoding", name unless name.isutf8	145	raise ArgumentError, "Bad encoding", name unless name.isutf8
146		146
147	atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).unaccent.toutf8).elements['api/query/pages/page'].attributes	147	atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).unaccent.toutf8).elements['api/query/pages/page'].attributes
148		148
149	atts['fullurl'] if atts['missing'].nil?	149	atts['fullurl'] if atts['missing'].nil?
150	end	150	end
151		151
152	def self.search_homepage(name)	152	def self.search_homepage(name)
153	title = WikipediaPage.search_wikipedia_titles name	153	title = WikipediaPage.search_wikipedia_titles name
154		154
155	WikipediaPage.get_url(title[0]) unless title.nil? \|\| title.empty?	155	WikipediaPage.get_url(title[0]) unless title.nil? \|\| title.empty?
156	end	156	end
157		157
158	end	158	end
159	end	159	end
160		160

lib/mirimiri/query.rb

Diff comments View file @ ca96fb3

 #!/usr/bin/env ruby
 #--
 # This file is a part of the mirimiri library
 #
 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #++
 class Query
 end
 module Indri
   class Parameters
     attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline
     def initialize(corpus,count="1000",mem="1g",offset="1",run_id="default",print_query=false,print_docs=false)
       @index_path  = corpus
       @memory      = mem
       @count       = count
       @offset      = offset
       @run_id      = run_id
       @print_query = print_query ? "true" : "false"
       @print_docs  = print_docs  ? "true" : "false"
     end
     def to_s
       h = "<memory>#{@memory}</memory>\n"
       h += "<index>#{@index_path}</index>\n"
       h += "<count>#{@count}</count>\n"
       unless @baseline.nil?
         h += "<baseline>#{@baseline}</baseline>\n"
       else
         h += "<rule>#{@rule}</rule>\n"
       end
       h += "<trecFormat>true</trecFormat>\n"
       h += "<queryOffset>#{@offset}</queryOffset>\n"
       h += "<runID>#{@run_id}</runID>\n"
       h += "<printQuery>#{@print_query}</printQuery>\n"
       h += "<printDocuments>#{@print_docs}</printDocuments>\n"
       h
     end
   end
   class IndriQuery < Query
     attr_accessor :id, :query, :rule
     def initialize(id,query)
       @id     = id
       @query  = query
     end
     def to_s
       h = "<query>\n"
       h += "<number>#{@id}</number>\n"
       h += "<text>#{@query}</text>\n"
       h += "</query>\n"
       h
     end
+    def exec params
+      `IndriRunQuery -query='#{@query}' -index=#{params.index_path} -count=#{params.count} -rule=method:dirichlet,mu:2500 -trecFormat`
+    end
   end
   class IndriQueries
     attr_accessor :params, :queries
     def initialize(params,*queries)
       @queries = queries
       @params = params
       # Here we set the default retrieval model as Language Modeling
       # with a Dirichlet smoothing at 2500.
       # TODO: maybe a Rule class...
       @params.rule  = 'method:dirichlet,mu:2500' if @params.rule.nil?
     end
     def to_s
       h = "<parameters>\n"
       h += @params.to_s
       h += @queries.collect { |q| q.to_s }.join ""
       h += "</parameters>"
       h
     end
   end
 end

lib/mirimiri/string.rb

Diff comments View file @ ca96fb3

1	#!/usr/bin/env ruby	1	#!/usr/bin/env ruby
2		2
3	#--	3	#--
4	# This file is a part of the mirimiri library	4	# This file is a part of the mirimiri library
5	#	5	#
6	# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>	6	# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
7	#	7	#
8	# This program is free software: you can redistribute it and/or modify	8	# This program is free software: you can redistribute it and/or modify
9	# it under the terms of the GNU General Public License as published by	9	# it under the terms of the GNU General Public License as published by
10	# the Free Software Foundation, either version 3 of the License, or	10	# the Free Software Foundation, either version 3 of the License, or
11	# (at your option) any later version.	11	# (at your option) any later version.
12	#	12	#
13	# This program is distributed in the hope that it will be useful,	13	# This program is distributed in the hope that it will be useful,
14	# but WITHOUT ANY WARRANTY; without even the implied warranty of	14	# but WITHOUT ANY WARRANTY; without even the implied warranty of
15	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	15	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16	# GNU General Public License for more details.	16	# GNU General Public License for more details.
17	#	17	#
18	# You should have received a copy of the GNU General Public License	18	# You should have received a copy of the GNU General Public License
19	# along with this program. If not, see <http://www.gnu.org/licenses/>.	19	# along with this program. If not, see <http://www.gnu.org/licenses/>.
20	#++	20	#++
21		21
22	module Mirimiri	22	module Mirimiri
23		23
24	# These are the default stopwords provided by Lemur.	24	# These are the default stopwords provided by Lemur.
25	Stoplist = [	25	Stoplist = [
26	"a", "anything", "anyway", "anywhere", "apart", "are", "around", "as", "at", "av",	26	"a","about","above","according","across","after","afterwards","again","against",
27	"be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand",	27	"albeit","all","almost","alone","along","already","also","although","always","am",
28	"behind", "being", "below", "beside", "besides", "between", "beyond", "both", "but", "by",	28	"among","amongst","an","and","another","any","anybody","anyhow","anyone","anything",
29	"can", "cannot", "canst", "certain", "cf", "choose", "contrariwise", "cos", "could", "cu",	29	"anyway","anywhere","apart","are","around","as","at","av","be","became","because",
30	"day", "do", "does", "doesn't", "doing", "dost", "doth", "double", "down", "dual", "during",	30	"become","becomes","becoming","been","before","beforehand","behind","being","below",
31	"each", "either", "else", "elsewhere", "enough", "et", "etc", "even", "ever", "every",	31	"beside","besides","between","beyond","both","but","by","can","cannot","canst",
32	"everybody", "everyone", "everything", "everywhere", "except", "excepted", "excepting",	32	"certain","cf","choose","contrariwise","cos","could","cu","day","do","does","doesn't",
33	"exception", "exclude", "excluding", "exclusive", "far", "farther", "farthest", "few", "ff",	33	"doing","dost","doth","double","down","dual","during","each","either","else",
34	"first", "for", "formerly", "forth", "forward", "from", "front", "further", "furthermore",	34	"elsewhere","enough","et","etc","even","ever","every","everybody","everyone",
35	"furthest", "get", "go", "had", "halves", "hardly", "has", "hast", "hath", "have", "he",	35	"everything","everywhere","except","excepted","excepting","exception","exclude",
36	"hence", "henceforth", "her", "here", "hereabouts", "hereafter", "hereby", "herein", "hereto",	36	"excluding","exclusive","far","farther","farthest","few","ff","first","for",
37	"hereupon", "hers", "herself", "him", "himself", "hindmost", "his", "hither", "hitherto",	37	"formerly","forth","forward","from","front","further","furthermore","furthest","get",
38	"how", "however", "howsoever", "i", "ie", "if", "in", "inasmuch", "inc", "include",	38	"go","had","halves","hardly","has","hast","hath","have","he","hence","henceforth",
39	"included", "including", "indeed", "indoors", "inside", "insomuch", "instead", "into",	39	"her","here","hereabouts","hereafter","hereby","herein","hereto","hereupon","hers",
40	"inward", "inwards", "is", "it", "its", "itself", "just", "kind", "kg", "km", "last",	40	"herself","him","himself","hindmost","his","hither","hitherto","how","however",
41	"latter", "latterly", "less", "lest", "let", "like", "little", "ltd", "many", "may", "maybe",	41	"howsoever","i","ie","if","in","inasmuch","inc","include","included","including",
42	"me", "meantime", "meanwhile", "might", "moreover", "most", "mostly", "more", "mr", "mrs",	42	"indeed","indoors","inside","insomuch","instead","into","inward","inwards","is",
43	"ms", "much", "must", "my", "myself", "namely", "need", "neither", "never", "nevertheless",	43	"it","its","itself","just","kind","kg","km","last","latter","latterly","less","lest",
44	"next", "no", "nobody", "none", "nonetheless", "noone", "nope", "nor", "not", "nothing",	44	"let","like","little","ltd","many","may","maybe","me","meantime","meanwhile","might",
45	"notwithstanding", "now", "nowadays", "nowhere", "of", "off", "often", "ok", "on", "once",	45	"moreover","most","mostly","more","mr","mrs","ms","much","must","my","myself",
46	"one", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours",	46	"namely","need","neither","never","nevertheless","next","no","nobody","none",
47	"ourselves", "out", "outside", "over", "own", "per", "perhaps", "plenty", "provide", "quite",	47	"nonetheless","noone","nope","nor","not","nothing","notwithstanding","now","nowadays",
48	"rather", "really", "round", "said", "sake", "same", "sang", "save", "saw", "see", "seeing",	48	"nowhere","of","off","often","ok","on","once","one","only","onto","or","other",
49	"seem", "seemed", "seeming", "seems", "seen", "seldom", "selves", "sent", "several", "shalt",	49	"others","otherwise","ought","our","ours","ourselves","out","outside","over","own",
50	"she", "should", "shown", "sideways", "since", "slept", "slew", "slung", "slunk", "smote",	50	"per","perhaps","plenty","provide","quite","rather","really","round","said","sake",
51	"so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes",	51	"same","sang","save","saw","see","seeing","seem","seemed","seeming","seems","seen",
52	"somewhat", "somewhere", "spake", "spat", "spoke", "spoken", "sprang", "sprung", "stave",	52	"seldom","selves","sent","several","shalt","she","should","shown","sideways","since",
53	"staves", "still", "such", "supposing", "than", "that", "the", "thee", "their", "them",	53	"slept","slew","slung","slunk","smote","so","some","somebody","somehow","someone",
54	"themselves", "then", "thence", "thenceforth", "there", "thereabout", "thereabouts",	54	"something","sometime","sometimes","somewhat","somewhere","spake","spat","spoke",
55	"thereafter", "thereby", "therefore", "therein", "thereof", "thereon", "thereto", "thereupon",	55	"spoken","sprang","sprung","stave","staves","still","such","supposing","than","that",
56	"these", "they", "this", "those", "thou", "though", "thrice", "through", "throughout", "thru",	56	"the","thee","their","them","themselves","then","thence","thenceforth","there",
57	"thus", "thy", "thyself", "till", "to", "together", "too", "toward", "towards", "ugh",	57	"thereabout","thereabouts","thereafter","thereby","therefore","therein","thereof",
58	"unable", "under", "underneath", "unless", "unlike", "until", "up", "upon", "upward",	58	"thereon","thereto","thereupon","these","they","this","those","thou","though",
59	"upwards", "us", "use", "used", "using", "very", "via", "vs", "want", "was", "we", "week",	59	"thrice","through","throughout","thru","thus","thy","thyself","till","to","together",
60	"well", "were", "what", "whatever", "whatsoever", "when", "whence", "whenever", "whensoever",	60	"too","toward","towards","ugh","unable","under","underneath","unless","unlike",
61	"where", "whereabouts", "whereafter", "whereas", "whereat", "whereby", "wherefore",	61	"until","up","upon","upward","upwards","us","use","used","using","very","via","vs",
62	"wherefrom", "wherein", "whereinto", "whereof", "whereon", "wheresoever", "whereto",	62	"want","was","we","week","well","were","what","whatever","whatsoever","when","whence",
63	"whereunto", "whereupon", "wherever", "wherewith", "whether", "whew", "which", "whichever",	63	"whenever","whensoever","where","whereabouts","whereafter","whereas","whereat",
64	"whichsoever", "while", "whilst", "whither", "who", "whoa", "whoever", "whole", "whom",	64	"whereby","wherefore","wherefrom","wherein","whereinto","whereof","whereon",
65	"whomever", "whomsoever", "whose", "whosoever", "why", "will", "wilt", "with", "within",	65	"wheresoever","whereto","whereunto","whereupon","wherever","wherewith","whether",
66	"without", "worse", "worst", "would", "wow", "ye", "yet", "year", "yippee", "you", "your",	66	"whew","which","whichever","whichsoever","while","whilst","whither","who","whoa",
67	"yours", "yourself", "yourselves"	67	"whoever","whole","whom","whomever","whomsoever","whose","whosoever","why","will",
		68	"wilt","with","within","without","worse","worst","would","wow","ye","yet","year",
		69	"yippee","you","your","yours","yourself","yourselves"
68	]	70	]
69		71
70	Transmap = {	72	Transmap = {
71	"\xC3\x80" => "A", "\xC3\x81" => "A", "\xC3\x82" => "A", "\xC3\x83" => "A",	73	"\xC3\x80" => "A", "\xC3\x81" => "A", "\xC3\x82" => "A", "\xC3\x83" => "A",
72	"\xC3\x84" => "A", "\xC3\x85" => "A", "\xC3\x86" => "AE","\xC3\x87" => "C",	74	"\xC3\x84" => "A", "\xC3\x85" => "A", "\xC3\x86" => "AE","\xC3\x87" => "C",
73	"\xC3\x88" => "E", "\xC3\x89" => "E", "\xC3\x8A" => "E", "\xC3\x8B" => "E",	75	"\xC3\x88" => "E", "\xC3\x89" => "E", "\xC3\x8A" => "E", "\xC3\x8B" => "E",
74	"\xC3\x8C" => "I", "\xC3\x8D" => "I", "\xC3\x8E" => "I", "\xC3\x8F" => "I",	76	"\xC3\x8C" => "I", "\xC3\x8D" => "I", "\xC3\x8E" => "I", "\xC3\x8F" => "I",
75	"\xC3\x90" => "D", "\xC3\x91" => "N", "\xC3\x92" => "O", "\xC3\x93" => "O",	77	"\xC3\x90" => "D", "\xC3\x91" => "N", "\xC3\x92" => "O", "\xC3\x93" => "O",
76	"\xC3\x94" => "O", "\xC3\x95" => "O", "\xC3\x96" => "O", "\xC3\x98" => "O",	78	"\xC3\x94" => "O", "\xC3\x95" => "O", "\xC3\x96" => "O", "\xC3\x98" => "O",
77	"\xC3\x99" => "U", "\xC3\x9A" => "U", "\xC3\x9B" => "U", "\xC3\x9C" => "U",	79	"\xC3\x99" => "U", "\xC3\x9A" => "U", "\xC3\x9B" => "U", "\xC3\x9C" => "U",
78	"\xC3\x9D" => "Y", "\xC3\x9E" => "P", "\xC3\x9F" => "ss",	80	"\xC3\x9D" => "Y", "\xC3\x9E" => "P", "\xC3\x9F" => "ss",
79	"\xC3\xA0" => "a", "\xC3\xA1" => "a", "\xC3\xA2" => "a", "\xC3\xA3" => "a",	81	"\xC3\xA0" => "a", "\xC3\xA1" => "a", "\xC3\xA2" => "a", "\xC3\xA3" => "a",
80	"\xC3\xA4" => "a", "\xC3\xA5" => "a", "\xC3\xA6" => "ae","\xC3\xA7" => "c",	82	"\xC3\xA4" => "a", "\xC3\xA5" => "a", "\xC3\xA6" => "ae","\xC3\xA7" => "c",
81	"\xC3\xA8" => "e", "\xC3\xA9" => "e", "\xC3\xAA" => "e", "\xC3\xAB" => "e",	83	"\xC3\xA8" => "e", "\xC3\xA9" => "e", "\xC3\xAA" => "e", "\xC3\xAB" => "e",
82	"\xC3\xAC" => "i", "\xC3\xAD" => "i", "\xC3\xAE" => "i", "\xC3\xAF" => "i",	84	"\xC3\xAC" => "i", "\xC3\xAD" => "i", "\xC3\xAE" => "i", "\xC3\xAF" => "i",
83	"\xC3\xB0" => "o", "\xC3\xB1" => "n", "\xC3\xB2" => "o", "\xC3\xB3" => "o",	85	"\xC3\xB0" => "o", "\xC3\xB1" => "n", "\xC3\xB2" => "o", "\xC3\xB3" => "o",
84	"\xC3\xB4" => "o", "\xC3\xB5" => "o", "\xC3\xB6" => "o", "\xC3\xB8" => "o",	86	"\xC3\xB4" => "o", "\xC3\xB5" => "o", "\xC3\xB6" => "o", "\xC3\xB8" => "o",
85	"\xC3\xB9" => "u", "\xC3\xBA" => "u", "\xC3\xBB" => "u", "\xC3\xBC" => "u",	87	"\xC3\xB9" => "u", "\xC3\xBA" => "u", "\xC3\xBB" => "u", "\xC3\xBC" => "u",
86	"\xC3\xBD" => "y", "\xC3\xBE" => "p", "\xC3\xBF" => "y",	88	"\xC3\xBD" => "y", "\xC3\xBE" => "p", "\xC3\xBF" => "y",
87	"\xC4\x80" => "A", "\xC4\x81" => "a", "\xC4\x82" => "A", "\xC4\x83" => "a",	89	"\xC4\x80" => "A", "\xC4\x81" => "a", "\xC4\x82" => "A", "\xC4\x83" => "a",
88	"\xC4\x84" => "A", "\xC4\x85" => "a", "\xC4\x86" => "C", "\xC4\x87" => "c",	90	"\xC4\x84" => "A", "\xC4\x85" => "a", "\xC4\x86" => "C", "\xC4\x87" => "c",
89	"\xC4\x88" => "C", "\xC4\x89" => "c", "\xC4\x8A" => "C", "\xC4\x8B" => "c",	91	"\xC4\x88" => "C", "\xC4\x89" => "c", "\xC4\x8A" => "C", "\xC4\x8B" => "c",
90	"\xC4\x8C" => "C", "\xC4\x8D" => "c", "\xC4\x8E" => "D", "\xC4\x8F" => "d",	92	"\xC4\x8C" => "C", "\xC4\x8D" => "c", "\xC4\x8E" => "D", "\xC4\x8F" => "d",
91	"\xC4\x90" => "D", "\xC4\x91" => "d", "\xC4\x92" => "E", "\xC4\x93" => "e",	93	"\xC4\x90" => "D", "\xC4\x91" => "d", "\xC4\x92" => "E", "\xC4\x93" => "e",
92	"\xC4\x94" => "E", "\xC4\x95" => "e", "\xC4\x96" => "E", "\xC4\x97" => "e",	94	"\xC4\x94" => "E", "\xC4\x95" => "e", "\xC4\x96" => "E", "\xC4\x97" => "e",
93	"\xC4\x98" => "E", "\xC4\x99" => "e", "\xC4\x9A" => "E", "\xC4\x9B" => "e",	95	"\xC4\x98" => "E", "\xC4\x99" => "e", "\xC4\x9A" => "E", "\xC4\x9B" => "e",
94	"\xC4\x9C" => "G", "\xC4\x9D" => "g", "\xC4\x9E" => "G", "\xC4\x9F" => "g",	96	"\xC4\x9C" => "G", "\xC4\x9D" => "g", "\xC4\x9E" => "G", "\xC4\x9F" => "g",
95	"\xC4\xA0" => "G", "\xC4\xA1" => "g", "\xC4\xA2" => "G", "\xC4\xA3" => "g",	97	"\xC4\xA0" => "G", "\xC4\xA1" => "g", "\xC4\xA2" => "G", "\xC4\xA3" => "g",
96	"\xC4\xA4" => "H", "\xC4\xA5" => "h", "\xC4\xA6" => "H", "\xC4\xA7" => "h",	98	"\xC4\xA4" => "H", "\xC4\xA5" => "h", "\xC4\xA6" => "H", "\xC4\xA7" => "h",
97	"\xC4\xA8" => "I", "\xC4\xA9" => "i", "\xC4\xAA" => "I", "\xC4\xAB" => "i",	99	"\xC4\xA8" => "I", "\xC4\xA9" => "i", "\xC4\xAA" => "I", "\xC4\xAB" => "i",
98	"\xC4\xAC" => "I", "\xC4\xAD" => "i", "\xC4\xAE" => "I", "\xC4\xAF" => "i",	100	"\xC4\xAC" => "I", "\xC4\xAD" => "i", "\xC4\xAE" => "I", "\xC4\xAF" => "i",
99	"\xC4\xB0" => "I", "\xC4\xB1" => "i", "\xC4\xB2" => "IJ","\xC4\xB3" => "ij",	101	"\xC4\xB0" => "I", "\xC4\xB1" => "i", "\xC4\xB2" => "IJ","\xC4\xB3" => "ij",
100	"\xC4\xB4" => "J", "\xC4\xB5" => "j", "\xC4\xB6" => "K", "\xC4\xB7" => "k",	102	"\xC4\xB4" => "J", "\xC4\xB5" => "j", "\xC4\xB6" => "K", "\xC4\xB7" => "k",
101	"\xC4\xB8" => "k", "\xC4\xB9" => "L", "\xC4\xBA" => "l", "\xC4\xBB" => "L",	103	"\xC4\xB8" => "k", "\xC4\xB9" => "L", "\xC4\xBA" => "l", "\xC4\xBB" => "L",
102	"\xC4\xBC" => "l", "\xC4\xBD" => "L", "\xC4\xBE" => "l", "\xC4\xBF" => "L",	104	"\xC4\xBC" => "l", "\xC4\xBD" => "L", "\xC4\xBE" => "l", "\xC4\xBF" => "L",
103	"\xC5\x80" => "l", "\xC5\x81" => "L", "\xC5\x82" => "l", "\xC5\x83" => "N",	105	"\xC5\x80" => "l", "\xC5\x81" => "L", "\xC5\x82" => "l", "\xC5\x83" => "N",
104	"\xC5\x84" => "n", "\xC5\x85" => "N", "\xC5\x86" => "n", "\xC5\x87" => "N",	106	"\xC5\x84" => "n", "\xC5\x85" => "N", "\xC5\x86" => "n", "\xC5\x87" => "N",
105	"\xC5\x88" => "n", "\xC5\x89" => "n", "\xC5\x8A" => "N", "\xC5\x8B" => "n",	107	"\xC5\x88" => "n", "\xC5\x89" => "n", "\xC5\x8A" => "N", "\xC5\x8B" => "n",
106	"\xC5\x8C" => "O", "\xC5\x8D" => "o", "\xC5\x8E" => "O", "\xC5\x8F" => "o",	108	"\xC5\x8C" => "O", "\xC5\x8D" => "o", "\xC5\x8E" => "O", "\xC5\x8F" => "o",
107	"\xC5\x90" => "O", "\xC5\x91" => "o", "\xC5\x92" => "CE","\xC5\x93" => "ce",	109	"\xC5\x90" => "O", "\xC5\x91" => "o", "\xC5\x92" => "CE","\xC5\x93" => "ce",
108	"\xC5\x94" => "R", "\xC5\x95" => "r", "\xC5\x96" => "R", "\xC5\x97" => "r",	110	"\xC5\x94" => "R", "\xC5\x95" => "r", "\xC5\x96" => "R", "\xC5\x97" => "r",
109	"\xC5\x98" => "R", "\xC5\x99" => "r", "\xC5\x9A" => "S", "\xC5\x9B" => "s",	111	"\xC5\x98" => "R", "\xC5\x99" => "r", "\xC5\x9A" => "S", "\xC5\x9B" => "s",
110	"\xC5\x9C" => "S", "\xC5\x9D" => "s", "\xC5\x9E" => "S", "\xC5\x9F" => "s",	112	"\xC5\x9C" => "S", "\xC5\x9D" => "s", "\xC5\x9E" => "S", "\xC5\x9F" => "s",
111	"\xC5\xA0" => "S", "\xC5\xA1" => "s", "\xC5\xA2" => "T", "\xC5\xA3" => "t",	113	"\xC5\xA0" => "S", "\xC5\xA1" => "s", "\xC5\xA2" => "T", "\xC5\xA3" => "t",
112	"\xC5\xA4" => "T", "\xC5\xA5" => "t", "\xC5\xA6" => "T", "\xC5\xA7" => "t",	114	"\xC5\xA4" => "T", "\xC5\xA5" => "t", "\xC5\xA6" => "T", "\xC5\xA7" => "t",
113	"\xC5\xA8" => "U", "\xC5\xA9" => "u", "\xC5\xAA" => "U", "\xC5\xAB" => "u",	115	"\xC5\xA8" => "U", "\xC5\xA9" => "u", "\xC5\xAA" => "U", "\xC5\xAB" => "u",
114	"\xC5\xAC" => "U", "\xC5\xAD" => "u", "\xC5\xAE" => "U", "\xC5\xAF" => "u",	116	"\xC5\xAC" => "U", "\xC5\xAD" => "u", "\xC5\xAE" => "U", "\xC5\xAF" => "u",
115	"\xC5\xB0" => "U", "\xC5\xB1" => "u", "\xC5\xB2" => "U", "\xC5\xB3" => "u",	117	"\xC5\xB0" => "U", "\xC5\xB1" => "u", "\xC5\xB2" => "U", "\xC5\xB3" => "u",
116	"\xC5\xB4" => "W", "\xC5\xB5" => "w", "\xC5\xB6" => "Y", "\xC5\xB7" => "y",	118	"\xC5\xB4" => "W", "\xC5\xB5" => "w", "\xC5\xB6" => "Y", "\xC5\xB7" => "y",
117	"\xC5\xB8" => "Y", "\xC5\xB9" => "Z", "\xC5\xBA" => "z", "\xC5\xBB" => "Z",	119	"\xC5\xB8" => "Y", "\xC5\xB9" => "Z", "\xC5\xBA" => "z", "\xC5\xBB" => "Z",
118	"\xC5\xBC" => "z", "\xC5\xBD" => "Z", "\xC5\xBE" => "z", "\xC6\x8F" => "E",	120	"\xC5\xBC" => "z", "\xC5\xBD" => "Z", "\xC5\xBE" => "z", "\xC6\x8F" => "E",
119	"\xC6\xA0" => "O", "\xC6\xA1" => "o", "\xC6\xAF" => "U", "\xC6\xB0" => "u",	121	"\xC6\xA0" => "O", "\xC6\xA1" => "o", "\xC6\xAF" => "U", "\xC6\xB0" => "u",
120	"\xC7\x8D" => "A", "\xC7\x8E" => "a", "\xC7\x8F" => "I",	122	"\xC7\x8D" => "A", "\xC7\x8E" => "a", "\xC7\x8F" => "I",
121	"\xC7\x90" => "i", "\xC7\x91" => "O", "\xC7\x92" => "o", "\xC7\x93" => "U",	123	"\xC7\x90" => "i", "\xC7\x91" => "O", "\xC7\x92" => "o", "\xC7\x93" => "U",
122	"\xC7\x94" => "u", "\xC7\x95" => "U", "\xC7\x96" => "u", "\xC7\x97" => "U",	124	"\xC7\x94" => "u", "\xC7\x95" => "U", "\xC7\x96" => "u", "\xC7\x97" => "U",
123	"\xC7\x98" => "u", "\xC7\x99" => "U", "\xC7\x9A" => "u", "\xC7\x9B" => "U",	125	"\xC7\x98" => "u", "\xC7\x99" => "U", "\xC7\x9A" => "u", "\xC7\x9B" => "U",
124	"\xC7\x9C" => "u",	126	"\xC7\x9C" => "u",
125	"\xC7\xBA" => "A", "\xC7\xBB" => "a", "\xC7\xBC" => "AE","\xC7\xBD" => "ae",	127	"\xC7\xBA" => "A", "\xC7\xBB" => "a", "\xC7\xBC" => "AE","\xC7\xBD" => "ae",
126	"\xC7\xBE" => "O", "\xC7\xBF" => "o",	128	"\xC7\xBE" => "O", "\xC7\xBF" => "o",
127	"\xC9\x99" => "e",	129	"\xC9\x99" => "e",
128	"\xC2\x82" => ",", # High code comma	130	"\xC2\x82" => ",", # High code comma
129	"\xC2\x84" => ",,", # High code double comma	131	"\xC2\x84" => ",,", # High code double comma
130	"\xC2\x85" => "...", # Triple dot	132	"\xC2\x85" => "...", # Triple dot
131	"\xC2\x88" => "^", # High caret	133	"\xC2\x88" => "^", # High caret
132	"\xC2\x91" => "\x27", # Forward single quote	134	"\xC2\x91" => "\x27", # Forward single quote
133	"\xC2\x92" => "\x27", # Reverse single quote	135	"\xC2\x92" => "\x27", # Reverse single quote
134	"\xC2\x93" => "\x22", # Forward double quote	136	"\xC2\x93" => "\x22", # Forward double quote
135	"\xC2\x94" => "\x22", # Reverse double quote	137	"\xC2\x94" => "\x22", # Reverse double quote
136	"\xC2\x96" => "-", # High hyphen	138	"\xC2\x96" => "-", # High hyphen
137	"\xC2\x97" => "--", # Double hyphen	139	"\xC2\x97" => "--", # Double hyphen
138	"\xC2\xA6" => "\|", # Split vertical bar	140	"\xC2\xA6" => "\|", # Split vertical bar
139	"\xC2\xAB" => "<<", # Double less than	141	"\xC2\xAB" => "<<", # Double less than
140	"\xC2\xBB" => ">>", # Double greater than	142	"\xC2\xBB" => ">>", # Double greater than
141	"\xC2\xBC" => "1/4", # one quarter	143	"\xC2\xBC" => "1/4", # one quarter
142	"\xC2\xBD" => "1/2", # one half	144	"\xC2\xBD" => "1/2", # one half
143	"\xC2\xBE" => "3/4", # three quarters	145	"\xC2\xBE" => "3/4", # three quarters
144	"\xCA\xBF" => "\x27", # c-single quote	146	"\xCA\xBF" => "\x27", # c-single quote
145	"\xCC\xA8" => "", # modifier - under curve	147	"\xCC\xA8" => "", # modifier - under curve
146	"\xCC\xB1" => "", # modifier - under line	148	"\xCC\xB1" => "", # modifier - under line
147	/\W/ => ""	149	# /\W/ => ""
148	}	150	}
149		151
150	end	152	end
151		153
152	# Extension of the standard class String with useful functions.	154	# Extension of the standard class String with useful functions.
153	class String	155	class String
154	include Mirimiri	156	include Mirimiri
155		157
156	def unaccent	158	def unaccent
157	# force_encoding is needed with ruby1.9	159	# force_encoding is needed with ruby1.9
158	Transmap.inject(self.force_encoding("ASCII-8BIT")) { \|str, (utf8, asc)\| str.gsub(utf8, asc) }	160	Transmap.inject(self.force_encoding("ASCII-8BIT")) { \|str, (utf8, asc)\| str.gsub(utf8, asc) }
159	end	161	end
160		162
161	# Returns +true+ if +self+ belongs to Rir::Stoplist, +false+ otherwise.	163	# Returns +true+ if +self+ belongs to Rir::Stoplist, +false+ otherwise.
162	def is_stopword?	164	def is_stopword?
163	Stoplist.include?(self.downcase)	165	Stoplist.include?(self.downcase)
164	end	166	end
165		167
166	# Do not use.	168	# Do not use.
167	# TODO: revamp. Find out why this function is here.	169	# TODO: revamp. Find out why this function is here.
168	def remove_special_characters	170	def remove_special_characters
169	self.split.collect { \|w\| w.gsub(/\W/,' ').split.collect { \|w\| w.gsub(/\W/,' ').strip.sub(/\A.\z/, '')}.join(' ').strip.sub(/\A.\z/, '')}.join(' ')	171	self.split.collect { \|w\| w.gsub(/\W/,' ').split.collect { \|w\| w.gsub(/\W/,' ').strip.sub(/\A.\z/, '')}.join(' ').strip.sub(/\A.\z/, '')}.join(' ')
170	end	172	end
171		173
172	# Removes all XML-like tags from +self+.	174	# Removes all XML-like tags from +self+.
173	#	175	#
174	# s = "<html><body>test</body></html>"	176	# s = "<html><body>test</body></html>"
175	# s.strip_xml_tags!	177	# s.strip_xml_tags!
176	# s #=> "test"	178	# s #=> "test"
177	def strip_xml_tags!	179	def strip_xml_tags!
178	replace strip_with_pattern /<\/?[^>]*>/	180	replace strip_with_pattern /<\/?[^>]*>/
179	end	181	end
180		182
181	# Removes all XML-like tags from +self+.	183	# Removes all XML-like tags from +self+.
182	#	184	#
183	# s = "<html><body>test</body></html>"	185	# s = "<html><body>test</body></html>"
184	# s.strip_xml_tags #=> "test"	186	# s.strip_xml_tags #=> "test"
185	# s #=> "<html><body>test</body></html>"	187	# s #=> "<html><body>test</body></html>"
186	def strip_xml_tags	188	def strip_xml_tags
187	dup.strip_xml_tags!	189	dup.strip_xml_tags!
188	end	190	end
189		191
190	# Removes all Javascript sources from +self+.	192	# Removes all Javascript sources from +self+.
191	#	193	#
192	# s = "<script type='text/javascript'>	194	# s = "<script type='text/javascript'>
193	# var skin='vector',	195	# var skin='vector',
194	# stylepath='http://bits.wikimedia.org/skins-1.5'	196	# stylepath='http://bits.wikimedia.org/skins-1.5'
195	# </script>	197	# </script>
196	#	198	#
197	# test"	199	# test"
198	# s.strip_javascripts!	200	# s.strip_javascripts!
199	# s #=> "test"	201	# s #=> "test"
200	def strip_javascripts!	202	def strip_javascripts!
201	replace strip_with_pattern /<script type="text\/javascript">(.+?)<\/script>/m	203	replace strip_with_pattern /<script type="text\/javascript">(.+?)<\/script>/m
202	end	204	end
203		205
204	# Removes all Javascript sources from +self+.	206	# Removes all Javascript sources from +self+.
205	#	207	#
206	# s = "<script type='text/javascript'>	208	# s = "<script type='text/javascript'>
207	# var skin='vector',	209	# var skin='vector',
208	# stylepath='http://bits.wikimedia.org/skins-1.5'	210	# stylepath='http://bits.wikimedia.org/skins-1.5'
209	# </script>	211	# </script>
210	#	212	#
211	# test"	213	# test"
212	# s.strip_javascripts #=> "test"	214	# s.strip_javascripts #=> "test"
213	def strip_javascripts	215	def strip_javascripts
214	dup.strip_javascripts!	216	dup.strip_javascripts!
215	end	217	end
216		218
217	def strip_stylesheets!	219	def strip_stylesheets!
218	# TODO: revamp. Not sure what this is for.	220	# TODO: revamp. Not sure what this is for.
219	replace strip_with_pattern /<style type="text\/css">(.+?)<\/style>/m	221	replace strip_with_pattern /<style type="text\/css">(.+?)<\/style>/m
220	end	222	end
221		223
222	def strip_stylesheets	224	def strip_stylesheets
223	dup.strip_stylesheets!	225	dup.strip_stylesheets!
224	end	226	end
225		227
226	# Removes punctuation from +self+.	228	# Removes punctuation from +self+.
227	#	229	#
228	# s = "hello, world. how are you?!"	230	# s = "hello, world. how are you?!"
229	# s.strip_punctuation!	231	# s.strip_punctuation!
230	# s # => "hello world how are you"	232	# s # => "hello world how are you"
231	def strip_punctuation!	233	def strip_punctuation!
232	replace strip_with_pattern /[^a-zA-Z0-9\-\s]/	234	replace strip_with_pattern /[^a-zA-Z0-9\-\s]/
233	end	235	end
234		236
235	# Removes punctuation from +self+.	237	# Removes punctuation from +self+.
236	#	238	#
237	# s = "hello, world. how are you?!"	239	# s = "hello, world. how are you?!"
238	# s.strip_punctuation # => "hello world how are you"	240	# s.strip_punctuation # => "hello world how are you"
239	def strip_punctuation	241	def strip_punctuation
240	dup.strip_punctuation!	242	dup.strip_punctuation!
241	end	243	end
242		244
243	# Returns the text values inside all occurrences of an XML tag in +self+	245	# Returns the text values inside all occurrences of an XML tag in +self+
244	#	246	#
245	# s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre"	247	# s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre"
246	# s.extract_xmltags_values 'a' #=> ["Indianapolis", "Indiana"]	248	# s.extract_xmltags_values 'a' #=> ["Indianapolis", "Indiana"]
247	def extract_xmltags_values(tag_name)	249	def extract_xmltags_values(tag_name)
248	self.scan(/<#{tag_name}.*?>(.+?)<\/#{tag_name}>/).flatten	250	self.scan(/<#{tag_name}.*?>(.+?)<\/#{tag_name}>/).flatten
249	end	251	end
250		252
251	def strip_with_pattern(pattern)	253	def strip_with_pattern(pattern)
252	require 'cgi'	254	require 'cgi'
253	require 'kconv'	255
254	CGI::unescapeHTML(self.gsub(pattern,"")).toutf8	256	CGI::unescapeHTML(self.gsub(pattern,"")).unaccent.encode("UTF-8", {:invalid => :replace, :undef => :replace, :replace => " "})
255	end	257	end
256		258
257	private :strip_with_pattern	259	private :strip_with_pattern
258	end	260	end
259		261