Commit 145387519e2023db9ed69bc07a52f1b71b6445fe

Authored by Romain Deveaud
1 parent fd4cb285a4
Exists in master

new stuff with wikipedia

Showing 8 changed files with 92 additions and 4 deletions Inline Diff

1 #!/usr/bin/env ruby 1 #!/usr/bin/env ruby
2 2
3 require 'rir/document' 3 require 'rir/document'
4 require 'rir/string' 4 require 'rir/string'
5 require 'rir/query' 5 require 'rir/query'
6 require 'rir/corpus'
7 require 'rir/regexp'
6 8
1 #!/usr/bin/env ruby 1 #!/usr/bin/env ruby
2 2
3 # This file is a part of an Information Retrieval oriented Ruby library 3 # This file is a part of an Information Retrieval oriented Ruby library
4 # 4 #
5 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> 5 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
6 # 6 #
7 # This program is free software: you can redistribute it and/or modify 7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by 8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or 9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version. 10 # (at your option) any later version.
11 # 11 #
12 # This program is distributed in the hope that it will be useful, 12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details. 15 # GNU General Public License for more details.
16 # 16 #
17 # You should have received a copy of the GNU General Public License 17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>. 18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
19 19
20 # General module for many purposes related to Information Retrieval. 20 # General module for many purposes related to Information Retrieval.
21 module RIR 21 module RIR
22 22
23 class Corpus 23 class Corpus
24 attr_accessor :path 24 attr_accessor :path
25 25
26 def initialize(path) 26 def initialize(path)
27 @path = path 27 @path = path.chomp "/"
28 end 28 end
29 29
30 # Recursively lists all files under +self.path+ whose name contains a dot.
31 # WARNING: this function may take a long time if there are many
32 # files in subdirectories.
33 #
34 # c = Corpus.new "my/path"
35 # c.files # => ["my/path/README.txt", "my/path/lib/code.rb"]
30 def files 36 def files
31 Dir.glob("**/*.*") 37 Dir["#{@path}/**/*.*"]
32 end 38 end
33 end 39 end
34 40
35 end 41 end
36 42
1 #!/usr/bin/env ruby 1 #!/usr/bin/env ruby
2 2
3 # This file is a part of an Information Retrieval oriented Ruby library 3 # This file is a part of an Information Retrieval oriented Ruby library
4 # 4 #
5 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> 5 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
6 # 6 #
7 # This program is free software: you can redistribute it and/or modify 7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by 8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or 9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version. 10 # (at your option) any later version.
11 # 11 #
12 # This program is distributed in the hope that it will be useful, 12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details. 15 # GNU General Public License for more details.
16 # 16 #
17 # You should have received a copy of the GNU General Public License 17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>. 18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
19 19
20 # General module for many purposes related to Information Retrieval. 20 # General module for many purposes related to Information Retrieval.
21 module RIR 21 module RIR
22 22
23 # A Document is a bag of words and is constructed from a string. 23 # A Document is a bag of words and is constructed from a string.
24 class Document 24 class Document
25 attr_reader :words, :doc_content 25 attr_reader :words, :doc_content
26 26
27 # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html 27 # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html
28 # and the \\W special escape). 28 # and the \\W special escape).
29 # 29 #
30 # Protected function, only meant to be called at initialization. 30 # Protected function, only meant to be called at initialization.
31 def format_words 31 def format_words
32 wo = [] 32 wo = []
33 33
34 @doc_content.split.each do |w| 34 @doc_content.split.each do |w|
35 w.split(/\W/).each do |sw| 35 w.split(/\W/).each do |sw|
36 wo.push(sw) if sw =~ /[a-zA-Z]/ 36 wo.push(sw) if sw =~ /[a-zA-Z]/
37 end 37 end
38 end 38 end
39 39
40 wo 40 wo
41 end 41 end
42 42
43 # Returns an Array containing the +n+-grams (words) from the current Document. 43 # Returns an Array containing the +n+-grams (words) from the current Document.
44 # 44 #
45 # ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...] 45 # ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]
46 def ngrams(n) 46 def ngrams(n)
47 window = [] 47 window = []
48 ngrams_array = [] 48 ngrams_array = []
49 49
50 @words.each do |w| 50 @words.each do |w|
51 window.push(w) 51 window.push(w)
52 if window.size == n 52 if window.size == n
53 ngrams_array.push window.join(" ") 53 ngrams_array.push window.join(" ")
54 window.delete_at(0) 54 window.delete_at(0)
55 end 55 end
56 end 56 end
57 57
58 ngrams_array.uniq 58 ngrams_array.uniq
59 end 59 end
60 60
61 # Returns a Hash containing the words and their associated counts in the current Document. 61 # Returns a Hash containing the words and their associated counts in the current Document.
62 # 62 #
63 # count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... } 63 # count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }
64 def count_words 64 def count_words
65 counts = Hash.new { |h,k| h[k] = 0 } 65 counts = Hash.new { |h,k| h[k] = 0 }
66 @words.each { |w| counts[w.downcase] += 1 } 66 @words.each { |w| counts[w.downcase] += 1 }
67 67
68 counts 68 counts
69 end 69 end
70 70
71 # Computes the entropy of a given string +s+ inside the document. 71 # Computes the entropy of a given string +s+ inside the document.
72 # 72 #
73 # If the string parameter is composed of many words (i.e. tokens separated 73 # If the string parameter is composed of many words (i.e. tokens separated
74 # by whitespace(s)), it is considered as an ngram. 74 # by whitespace(s)), it is considered as an ngram.
75 # 75 #
76 # entropy("guitar") #=> 0.00389919463243839 76 # entropy("guitar") #=> 0.00389919463243839
77 def entropy(s) 77 def entropy(s)
78 en = 0.0 78 en = 0.0
79 counts = self.count_words 79 counts = self.count_words
80 80
81 s.split.each do |w| 81 s.split.each do |w|
82 p_wi = counts[w].to_f/@words.count.to_f 82 p_wi = counts[w].to_f/@words.count.to_f
83 en += p_wi*Math.log2(p_wi) 83 en += p_wi*Math.log2(p_wi)
84 end 84 end
85 85
86 en *= -1 86 en *= -1
87 en 87 en
88 end 88 end
89 89
90 90
91 91
92 def initialize(content) 92 def initialize(content)
93 @doc_content = content 93 @doc_content = content
94 @words = format_words 94 @words = format_words
95 end 95 end
96 96
97 protected :format_words 97 protected :format_words
98 end 98 end
99 99
100 # A WebDocument is a Document with a +url+. 100 # A WebDocument is a Document with a +url+.
101 class WebDocument < Document 101 class WebDocument < Document
102 attr_reader :url 102 attr_reader :url
103 103
104 # Returns the HTML text from the page of a given +url+. 104 # Returns the HTML text from the page of a given +url+.
105 def self.get_content(url) 105 def self.get_content(url)
106 require 'net/http' 106 require 'net/http'
107 Net::HTTP.get(URI.parse(url)) 107 Net::HTTP.get(URI.parse(url))
108 end 108 end
109 109
110 # WebDocument constructor, the content of the Document is the HTML page 110 # WebDocument constructor, the content of the Document is the HTML page
111 # without the tags. 111 # without the tags.
112 def initialize(url) 112 def initialize(url)
113 @url = url 113 @url = url
114 super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags 114 super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags
115 end 115 end
116 end 116 end
117 117
118 # A WikipediaPage is a WebDocument. 118 # A WikipediaPage is a WebDocument.
119 class WikipediaPage < WebDocument 119 class WikipediaPage < WebDocument
120 require 'rexml/document'
121 require 'net/http'
122 require 'kconv'
123
124
125 def self.search_wikipedia_titles(name)
126 res = REXML::Document.new(Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml")).toutf8).elements['api/query/search']
127
128 res.collect { |e| e.attributes['title'] } unless res.nil?
129 end
130
131 def self.get_url(name)
132 atts = REXML::Document.new(Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml")).toutf8).elements['api/query/pages/page'].attributes
133
134 atts['fullurl'] if atts['missing'].nil?
135 end
136
137 def self.search_homepage(name)
138 title = WikipediaPage.search_wikipedia_titles name
139
140 begin
141 WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty?
142 rescue
143 puts title[0]
144 end
145 end
146
147 # def initialize(name)
148 # title = WikipediaPage.search_wikipedia_titles name
149 # raise ArgumentError, "No page found" if title.empty?
150 # super WikipediaPage.get_url title[0]
151 # end
120 end 152 end
121 end 153 end
122 154
1 #!/usr/bin/env ruby 1 #!/usr/bin/env ruby
2 2
3 # This file is a part of an Information Retrieval oriented Ruby library 3 # This file is a part of an Information Retrieval oriented Ruby library
4 # 4 #
5 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> 5 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
6 # 6 #
7 # This program is free software: you can redistribute it and/or modify 7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by 8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or 9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version. 10 # (at your option) any later version.
11 # 11 #
12 # This program is distributed in the hope that it will be useful, 12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details. 15 # GNU General Public License for more details.
16 # 16 #
17 # You should have received a copy of the GNU General Public License 17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>. 18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
19 19
20 # General module for many purposes related to Information Retrieval. 20 # General module for many purposes related to Information Retrieval.
21 module RIR 21 module RIR
22 22
23 class Query 23 class Query
24 end 24 end
25 25
26 module Indri 26 module Indri
27 27
28 class Parameters 28 class Parameters
29 attr_accessor :corpus, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline 29 attr_accessor :corpus, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline
30 30
31 def initialize(corpus,mem="1g",count="1000",offset="1",run_id="default",print_query=false,print_docs=false) 31 def initialize(corpus,mem="1g",count="1000",offset="1",run_id="default",print_query=false,print_docs=false)
32 @corpus = corpus 32 @corpus = corpus
33 @memory = mem 33 @memory = mem
34 @count = count 34 @count = count
35 @offset = offset 35 @offset = offset
36 @run_id = run_id 36 @run_id = run_id
37 @print_query = print_query ? "true" : "false" 37 @print_query = print_query ? "true" : "false"
38 @print_docs = print_docs ? "true" : "false" 38 @print_docs = print_docs ? "true" : "false"
39 end 39 end
40 40
41 def to_s 41 def to_s
42 h = "<parameters>\n" 42 h = "<parameters>\n"
43 h += "<memory>#{@memory}</memory>\n" 43 h += "<memory>#{@memory}</memory>\n"
44 h += "<index>#{@corpus}</index>\n" 44 h += "<index>#{@corpus}</index>\n"
45 h += "<count>#{@count}</count>\n" 45 h += "<count>#{@count}</count>\n"
46 unless @baseline.nil? 46 unless @baseline.nil?
47 h += "<baseline>#{@baseline}</baseline>\n" 47 h += "<baseline>#{@baseline}</baseline>\n"
48 else 48 else
49 h += "<rule>#{@rule}</rule>\n" 49 h += "<rule>#{@rule}</rule>\n"
50 end 50 end
51 h += "<queryOffset>#{@offset}</queryOffset>\n" 51 h += "<queryOffset>#{@offset}</queryOffset>\n"
52 h += "<runID>#{@run_id}</runID>\n" 52 h += "<runID>#{@run_id}</runID>\n"
53 h += "<printQuery>#{@print_query}</printQuery>\n" 53 h += "<printQuery>#{@print_query}</printQuery>\n"
54 h += "<printDocuments>#{@print_docs}</printDocuments>\n" 54 h += "<printDocuments>#{@print_docs}</printDocuments>\n"
55 55
56 h 56 h
57 end 57 end
58 end 58 end
59 59
60 class IndriQuery < Query 60 class IndriQuery < Query
61 attr_accessor :id, :query, :params, :rule 61 attr_accessor :id, :query, :params, :rule
62 62
63 def initialize(id,query,params) 63 def initialize(id,query,params)
64 # @params = Parameters === params ? params : Parameters.new(corpus)
65 @params = params 64 @params = params
66 # Here we set the default retrieval model as Language Modeling 65 # Here we set the default retrieval model as Language Modeling
67 # with a Dirichlet smoothing at 2500. 66 # with a Dirichlet smoothing at 2500.
68 # TODO: maybe a Rule class... 67 # TODO: maybe a Rule class...
69 @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil? 68 @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil?
70 69
71 @id = id 70 @id = id
72 @query = query 71 @query = query
73 end 72 end
74 73
75 def to_s 74 def to_s
76 h = @params.to_s 75 h = @params.to_s
77 h += "<query>\n" 76 h += "<query>\n"
78 h += "<number>#{@id}</number>\n" 77 h += "<number>#{@id}</number>\n"
79 h += "<text>#{@query}</text>\n" 78 h += "<text>#{@query}</text>\n"
80 h += "</query>\n" 79 h += "</query>\n"
81 h += "</parameters>" 80 h += "</parameters>"
82 81
83 h 82 h
84 end 83 end
85 end 84 end
86 85
87 end 86 end
88 end 87 end
89 88
File was created 1 #!/usr/bin/env ruby
2
3 # This file is a part of an Information Retrieval oriented Ruby library
4 #
5 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
6 #
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11 #
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
16 #
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
20 class Regexp
21
22 def negated
23 /^((?!#{self}).)*$/
24 end
25
26 end
27
1 #!/usr/bin/env ruby 1 #!/usr/bin/env ruby
2 2
3 # This file is a part of an Information Retrieval oriented Ruby library 3 # This file is a part of an Information Retrieval oriented Ruby library
4 # 4 #
5 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> 5 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
6 # 6 #
7 # This program is free software: you can redistribute it and/or modify 7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by 8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or 9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version. 10 # (at your option) any later version.
11 # 11 #
12 # This program is distributed in the hope that it will be useful, 12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details. 15 # GNU General Public License for more details.
16 # 16 #
17 # You should have received a copy of the GNU General Public License 17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>. 18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
19 19
20 # General module for many purposes related to Information Retrieval. 20 # General module for many purposes related to Information Retrieval.
21 module RIR 21 module RIR
22 22
23 # These are the default stopwords provided by Lemur. 23 # These are the default stopwords provided by Lemur.
24 Stoplist = [ 24 Stoplist = [
25 "a", "anything", "anyway", "anywhere", "apart", "are", "around", "as", "at", "av", 25 "a", "anything", "anyway", "anywhere", "apart", "are", "around", "as", "at", "av",
26 "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", 26 "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand",
27 "behind", "being", "below", "beside", "besides", "between", "beyond", "both", "but", "by", 27 "behind", "being", "below", "beside", "besides", "between", "beyond", "both", "but", "by",
28 "can", "cannot", "canst", "certain", "cf", "choose", "contrariwise", "cos", "could", "cu", 28 "can", "cannot", "canst", "certain", "cf", "choose", "contrariwise", "cos", "could", "cu",
29 "day", "do", "does", "doesn't", "doing", "dost", "doth", "double", "down", "dual", "during", 29 "day", "do", "does", "doesn't", "doing", "dost", "doth", "double", "down", "dual", "during",
30 "each", "either", "else", "elsewhere", "enough", "et", "etc", "even", "ever", "every", 30 "each", "either", "else", "elsewhere", "enough", "et", "etc", "even", "ever", "every",
31 "everybody", "everyone", "everything", "everywhere", "except", "excepted", "excepting", 31 "everybody", "everyone", "everything", "everywhere", "except", "excepted", "excepting",
32 "exception", "exclude", "excluding", "exclusive", "far", "farther", "farthest", "few", "ff", 32 "exception", "exclude", "excluding", "exclusive", "far", "farther", "farthest", "few", "ff",
33 "first", "for", "formerly", "forth", "forward", "from", "front", "further", "furthermore", 33 "first", "for", "formerly", "forth", "forward", "from", "front", "further", "furthermore",
34 "furthest", "get", "go", "had", "halves", "hardly", "has", "hast", "hath", "have", "he", 34 "furthest", "get", "go", "had", "halves", "hardly", "has", "hast", "hath", "have", "he",
35 "hence", "henceforth", "her", "here", "hereabouts", "hereafter", "hereby", "herein", "hereto", 35 "hence", "henceforth", "her", "here", "hereabouts", "hereafter", "hereby", "herein", "hereto",
36 "hereupon", "hers", "herself", "him", "himself", "hindmost", "his", "hither", "hitherto", 36 "hereupon", "hers", "herself", "him", "himself", "hindmost", "his", "hither", "hitherto",
37 "how", "however", "howsoever", "i", "ie", "if", "in", "inasmuch", "inc", "include", 37 "how", "however", "howsoever", "i", "ie", "if", "in", "inasmuch", "inc", "include",
38 "included", "including", "indeed", "indoors", "inside", "insomuch", "instead", "into", 38 "included", "including", "indeed", "indoors", "inside", "insomuch", "instead", "into",
39 "inward", "inwards", "is", "it", "its", "itself", "just", "kind", "kg", "km", "last", 39 "inward", "inwards", "is", "it", "its", "itself", "just", "kind", "kg", "km", "last",
40 "latter", "latterly", "less", "lest", "let", "like", "little", "ltd", "many", "may", "maybe", 40 "latter", "latterly", "less", "lest", "let", "like", "little", "ltd", "many", "may", "maybe",
41 "me", "meantime", "meanwhile", "might", "moreover", "most", "mostly", "more", "mr", "mrs", 41 "me", "meantime", "meanwhile", "might", "moreover", "most", "mostly", "more", "mr", "mrs",
42 "ms", "much", "must", "my", "myself", "namely", "need", "neither", "never", "nevertheless", 42 "ms", "much", "must", "my", "myself", "namely", "need", "neither", "never", "nevertheless",
43 "next", "no", "nobody", "none", "nonetheless", "noone", "nope", "nor", "not", "nothing", 43 "next", "no", "nobody", "none", "nonetheless", "noone", "nope", "nor", "not", "nothing",
44 "notwithstanding", "now", "nowadays", "nowhere", "of", "off", "often", "ok", "on", "once", 44 "notwithstanding", "now", "nowadays", "nowhere", "of", "off", "often", "ok", "on", "once",
45 "one", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", 45 "one", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours",
46 "ourselves", "out", "outside", "over", "own", "per", "perhaps", "plenty", "provide", "quite", 46 "ourselves", "out", "outside", "over", "own", "per", "perhaps", "plenty", "provide", "quite",
47 "rather", "really", "round", "said", "sake", "same", "sang", "save", "saw", "see", "seeing", 47 "rather", "really", "round", "said", "sake", "same", "sang", "save", "saw", "see", "seeing",
48 "seem", "seemed", "seeming", "seems", "seen", "seldom", "selves", "sent", "several", "shalt", 48 "seem", "seemed", "seeming", "seems", "seen", "seldom", "selves", "sent", "several", "shalt",
49 "she", "should", "shown", "sideways", "since", "slept", "slew", "slung", "slunk", "smote", 49 "she", "should", "shown", "sideways", "since", "slept", "slew", "slung", "slunk", "smote",
50 "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", 50 "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes",
51 "somewhat", "somewhere", "spake", "spat", "spoke", "spoken", "sprang", "sprung", "stave", 51 "somewhat", "somewhere", "spake", "spat", "spoke", "spoken", "sprang", "sprung", "stave",
52 "staves", "still", "such", "supposing", "than", "that", "the", "thee", "their", "them", 52 "staves", "still", "such", "supposing", "than", "that", "the", "thee", "their", "them",
53 "themselves", "then", "thence", "thenceforth", "there", "thereabout", "thereabouts", 53 "themselves", "then", "thence", "thenceforth", "there", "thereabout", "thereabouts",
54 "thereafter", "thereby", "therefore", "therein", "thereof", "thereon", "thereto", "thereupon", 54 "thereafter", "thereby", "therefore", "therein", "thereof", "thereon", "thereto", "thereupon",
55 "these", "they", "this", "those", "thou", "though", "thrice", "through", "throughout", "thru", 55 "these", "they", "this", "those", "thou", "though", "thrice", "through", "throughout", "thru",
56 "thus", "thy", "thyself", "till", "to", "together", "too", "toward", "towards", "ugh", 56 "thus", "thy", "thyself", "till", "to", "together", "too", "toward", "towards", "ugh",
57 "unable", "under", "underneath", "unless", "unlike", "until", "up", "upon", "upward", 57 "unable", "under", "underneath", "unless", "unlike", "until", "up", "upon", "upward",
58 "upwards", "us", "use", "used", "using", "very", "via", "vs", "want", "was", "we", "week", 58 "upwards", "us", "use", "used", "using", "very", "via", "vs", "want", "was", "we", "week",
59 "well", "were", "what", "whatever", "whatsoever", "when", "whence", "whenever", "whensoever", 59 "well", "were", "what", "whatever", "whatsoever", "when", "whence", "whenever", "whensoever",
60 "where", "whereabouts", "whereafter", "whereas", "whereat", "whereby", "wherefore", 60 "where", "whereabouts", "whereafter", "whereas", "whereat", "whereby", "wherefore",
61 "wherefrom", "wherein", "whereinto", "whereof", "whereon", "wheresoever", "whereto", 61 "wherefrom", "wherein", "whereinto", "whereof", "whereon", "wheresoever", "whereto",
62 "whereunto", "whereupon", "wherever", "wherewith", "whether", "whew", "which", "whichever", 62 "whereunto", "whereupon", "wherever", "wherewith", "whether", "whew", "which", "whichever",
63 "whichsoever", "while", "whilst", "whither", "who", "whoa", "whoever", "whole", "whom", 63 "whichsoever", "while", "whilst", "whither", "who", "whoa", "whoever", "whole", "whom",
64 "whomever", "whomsoever", "whose", "whosoever", "why", "will", "wilt", "with", "within", 64 "whomever", "whomsoever", "whose", "whosoever", "why", "will", "wilt", "with", "within",
65 "without", "worse", "worst", "would", "wow", "ye", "yet", "year", "yippee", "you", "your", 65 "without", "worse", "worst", "would", "wow", "ye", "yet", "year", "yippee", "you", "your",
66 "yours", "yourself", "yourselves" 66 "yours", "yourself", "yourselves"
67 ] 67 ]
68 68
69 69
70 end 70 end
71 71
72 # Extension of the standard class String with useful functions. 72 # Extension of the standard class String with useful functions.
73 class String 73 class String
74 include RIR 74 include RIR
75 75
76 # Returns +true+ if +self+ belongs to RIR::Stoplist, +false+ otherwise. 76 # Returns +true+ if +self+ belongs to RIR::Stoplist, +false+ otherwise.
77 def is_stopword? 77 def is_stopword?
78 Stoplist.include?(self.downcase) 78 Stoplist.include?(self.downcase)
79 end 79 end
80 80
81 # Do not use. 81 # Do not use.
82 # TODO: revamp. Find out why this function is here. 82 # TODO: revamp. Find out why this function is here.
83 def remove_special_characters 83 def remove_special_characters
84 self.split.collect { |w| w.gsub(/\W/,' ').split.collect { |w| w.gsub(/\W/,' ').strip.sub(/\A.\z/, '')}.join(' ').strip.sub(/\A.\z/, '')}.join(' ') 84 self.split.collect { |w| w.gsub(/\W/,' ').split.collect { |w| w.gsub(/\W/,' ').strip.sub(/\A.\z/, '')}.join(' ').strip.sub(/\A.\z/, '')}.join(' ')
85 end 85 end
86 86
87 # Removes all XML-like tags from +self+. 87 # Removes all XML-like tags from +self+.
88 # 88 #
89 # s = "<html><body>test</body></html>" 89 # s = "<html><body>test</body></html>"
90 # s.strip_xml_tags! 90 # s.strip_xml_tags!
91 # s #=> "test" 91 # s #=> "test"
92 def strip_xml_tags! 92 def strip_xml_tags!
93 replace strip_with_pattern /<\/?[^>]*>/ 93 replace strip_with_pattern /<\/?[^>]*>/
94 end 94 end
95 95
96 # Removes all XML-like tags from +self+. 96 # Removes all XML-like tags from +self+.
97 # 97 #
98 # s = "<html><body>test</body></html>" 98 # s = "<html><body>test</body></html>"
99 # s.strip_xml_tags #=> "test" 99 # s.strip_xml_tags #=> "test"
100 # s #=> "<html><body>test</body></html>" 100 # s #=> "<html><body>test</body></html>"
101 def strip_xml_tags 101 def strip_xml_tags
102 dup.strip_xml_tags! 102 dup.strip_xml_tags!
103 end 103 end
104 104
105 # Removes all Javascript sources from +self+. 105 # Removes all Javascript sources from +self+.
106 # 106 #
107 # s = "<script type='text/javascript'> 107 # s = "<script type='text/javascript'>
108 # var skin='vector', 108 # var skin='vector',
109 # stylepath='http://bits.wikimedia.org/skins-1.5' 109 # stylepath='http://bits.wikimedia.org/skins-1.5'
110 # </script> 110 # </script>
111 # 111 #
112 # test" 112 # test"
113 # s.strip_javascripts! 113 # s.strip_javascripts!
114 # s #=> "test" 114 # s #=> "test"
115 def strip_javascripts! 115 def strip_javascripts!
116 replace strip_with_pattern /<script type="text\/javascript">(.+?)<\/script>/m 116 replace strip_with_pattern /<script type="text\/javascript">(.+?)<\/script>/m
117 end 117 end
118 118
119 # Removes all Javascript sources from +self+. 119 # Removes all Javascript sources from +self+.
120 # 120 #
121 # s = "<script type='text/javascript'> 121 # s = "<script type='text/javascript'>
122 # var skin='vector', 122 # var skin='vector',
123 # stylepath='http://bits.wikimedia.org/skins-1.5' 123 # stylepath='http://bits.wikimedia.org/skins-1.5'
124 # </script> 124 # </script>
125 # 125 #
126 # test" 126 # test"
127 # s.strip_javascripts #=> "test" 127 # s.strip_javascripts #=> "test"
128 def strip_javascripts 128 def strip_javascripts
129 dup.strip_javascripts! 129 dup.strip_javascripts!
130 end 130 end
131 131
132 def strip_stylesheets! 132 def strip_stylesheets!
133 # TODO: revamp. Not sure what this is for. 133 # TODO: revamp. Not sure what this is for.
134 replace strip_with_pattern /<style type="text\/css">(.+?)<\/style>/m 134 replace strip_with_pattern /<style type="text\/css">(.+?)<\/style>/m
135 end 135 end
136 136
137 def strip_stylesheets 137 def strip_stylesheets
138 dup.strip_stylesheets! 138 dup.strip_stylesheets!
139 end 139 end
140 140
141 # Removes punctuation from +self+.
142 #
143 # s = "hello, world. how are you?!"
144 # s.strip_punctuation!
145 # s # => "hello world how are you"
146 def strip_punctuation!
147 replace strip_with_pattern /[^a-zA-Z0-9\-\s]/
148 end
149
150 # Removes punctuation from +self+.
151 #
152 # s = "hello, world. how are you?!"
153 # s.strip_punctuation # => "hello world how are you"
154 def strip_punctuation
155 dup.strip_punctuation!
156 end
157
141 # Returns the text values inside all occurrences of an XML tag in +self+ 158 # Returns the text values inside all occurrences of an XML tag in +self+
142 # 159 #
143 # s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre" 160 # s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre"
144 # s.extract_xmltags_values 'a' #=> ["Indianapolis", "Indiana"] 161 # s.extract_xmltags_values 'a' #=> ["Indianapolis", "Indiana"]
145 def extract_xmltags_values(tag_name) 162 def extract_xmltags_values(tag_name)
146 self.scan(/<#{tag_name}.*?>(.+?)<\/#{tag_name}>/).flatten 163 self.scan(/<#{tag_name}.*?>(.+?)<\/#{tag_name}>/).flatten
147 end 164 end
148 165
149 def strip_with_pattern(pattern) 166 def strip_with_pattern(pattern)
150 require 'cgi' 167 require 'cgi'
151 require 'kconv' 168 require 'kconv'
152 CGI::unescapeHTML(self.gsub(pattern,"")).toutf8 169 CGI::unescapeHTML(self.gsub(pattern,"")).toutf8
153 end 170 end
154 171
155 private :strip_with_pattern 172 private :strip_with_pattern
156 end 173 end
157 174
1 $LOAD_PATH.unshift File.expand_path(File.join(File.dirname(__FILE__), "lib")) 1 $LOAD_PATH.unshift File.expand_path(File.join(File.dirname(__FILE__), "lib"))
2 2
3 require 'rir' 3 require 'rir'
4 4
5 w = RIR::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan") 5 w = RIR::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan")
6 p w.entropy("guitar") 6 p w.entropy("guitar")
7 7
8 params = RIR::Indri::Parameters.new("path_vers_mon_index") 8 params = RIR::Indri::Parameters.new("path_vers_mon_index")
9 p params.rule
10 q = RIR::Indri::IndriQuery.new("pouet", "bla", params) 9 q = RIR::Indri::IndriQuery.new("pouet", "bla", params)
11 puts q 10 puts q
11
12 c = RIR::Corpus.new "/home/romain/INEX/BookTrack/corpus/"
13 puts c.files.size
1 #!/usr/bin/env ruby 1 #!/usr/bin/env ruby
2 2
3 require 'test/unit' 3 require 'test/unit'
4 4
5 require 'string' 5 require 'string'
6 6
7 class TestString < Test::Unit::TestCase 7 class TestString < Test::Unit::TestCase
8 8
9 def test_extract_xml 9 def test_extract_xml
10 s = "four-piece in <a>Indianapolis</a>, <a>Indiana</a> at the Murat Theatre" 10 s = "four-piece in <a>Indianapolis</a>, <a>Indiana</a> at the Murat Theatre"
11 assert_equal(["Indianapolis", "Indiana"],s.extract_xmltags_values('a')) 11 assert_equal(["Indianapolis", "Indiana"],s.extract_xmltags_values('a'))
12 end 12 end
13 13
14 def test_stopword 14 def test_stopword
15 assert_equal(true, "is".is_stopword?) 15 assert_equal(true, "is".is_stopword?)
16 assert_equal(true, "seen".is_stopword?) 16 assert_equal(true, "seen".is_stopword?)
17 assert_equal(false, "totally".is_stopword?) 17 assert_equal(false, "totally".is_stopword?)
18 assert_equal(false, "Paris".is_stopword?) 18 assert_equal(false, "Paris".is_stopword?)
19 end 19 end
20 20
21 def test_strip_xml 21 def test_strip_xml
22 assert_equal("testme", "<test>testme</test>".strip_xml_tags) 22 assert_equal("testme", "<test>testme</test>".strip_xml_tags)
23 end 23 end
24
25 def test_strip_punctuation
26 assert_equal("test test test test test test", "test, test. .test, ;test !! ? test ...test./".strip_punctuation)
27 end
24 end 28 end
25 29