Commit 145387519e2023db9ed69bc07a52f1b71b6445fe
1 parent
fd4cb285a4
Exists in
master
new stuff with wikipedia
Showing 8 changed files with 92 additions and 4 deletions Inline Diff
lib/rir.rb
1 | #!/usr/bin/env ruby | 1 | #!/usr/bin/env ruby |
2 | 2 | ||
3 | require 'rir/document' | 3 | require 'rir/document' |
4 | require 'rir/string' | 4 | require 'rir/string' |
5 | require 'rir/query' | 5 | require 'rir/query' |
6 | require 'rir/corpus' | ||
7 | require 'rir/regexp' | ||
6 | 8 |
lib/rir/corpus.rb
1 | #!/usr/bin/env ruby | 1 | #!/usr/bin/env ruby |
2 | 2 | ||
3 | # This file is a part of an Information Retrieval oriented Ruby library | 3 | # This file is a part of an Information Retrieval oriented Ruby library |
4 | # | 4 | # |
5 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | 5 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> |
6 | # | 6 | # |
7 | # This program is free software: you can redistribute it and/or modify | 7 | # This program is free software: you can redistribute it and/or modify |
8 | # it under the terms of the GNU General Public License as published by | 8 | # it under the terms of the GNU General Public License as published by |
9 | # the Free Software Foundation, either version 3 of the License, or | 9 | # the Free Software Foundation, either version 3 of the License, or |
10 | # (at your option) any later version. | 10 | # (at your option) any later version. |
11 | # | 11 | # |
12 | # This program is distributed in the hope that it will be useful, | 12 | # This program is distributed in the hope that it will be useful, |
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | # GNU General Public License for more details. | 15 | # GNU General Public License for more details. |
16 | # | 16 | # |
17 | # You should have received a copy of the GNU General Public License | 17 | # You should have received a copy of the GNU General Public License |
18 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | 18 | # along with this program. If not, see <http://www.gnu.org/licenses/>. |
19 | 19 | ||
20 | # General module for many purposes related to Information Retrieval. | 20 | # General module for many purposes related to Information Retrieval. |
21 | module RIR | 21 | module RIR |
22 | 22 | ||
23 | class Corpus | 23 | class Corpus |
24 | attr_accessor :path | 24 | attr_accessor :path |
25 | 25 | ||
26 | def initialize(path) | 26 | def initialize(path) |
27 | @path = path | 27 | @path = path.chomp "/" |
28 | end | 28 | end |
29 | 29 | ||
30 | # Recursively returns all entries matching +*.*+ under +self.path+. | ||
31 | # WARNING: this method may take a long time if the subdirectories | ||
32 | # contain many files. | ||
33 | # | ||
34 | # c = Corpus.new "my/path" | ||
35 | # c.files # => ["my/path/README.txt", "my/path/lib/code.rb"] | ||
30 | def files | 36 | def files |
31 | Dir.glob("**/*.*") | 37 | Dir["#{@path}/**/*.*"] |
32 | end | 38 | end |
33 | end | 39 | end |
34 | 40 | ||
35 | end | 41 | end |
36 | 42 |
lib/rir/document.rb
1 | #!/usr/bin/env ruby | 1 | #!/usr/bin/env ruby |
2 | 2 | ||
3 | # This file is a part of an Information Retrieval oriented Ruby library | 3 | # This file is a part of an Information Retrieval oriented Ruby library |
4 | # | 4 | # |
5 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | 5 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> |
6 | # | 6 | # |
7 | # This program is free software: you can redistribute it and/or modify | 7 | # This program is free software: you can redistribute it and/or modify |
8 | # it under the terms of the GNU General Public License as published by | 8 | # it under the terms of the GNU General Public License as published by |
9 | # the Free Software Foundation, either version 3 of the License, or | 9 | # the Free Software Foundation, either version 3 of the License, or |
10 | # (at your option) any later version. | 10 | # (at your option) any later version. |
11 | # | 11 | # |
12 | # This program is distributed in the hope that it will be useful, | 12 | # This program is distributed in the hope that it will be useful, |
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | # GNU General Public License for more details. | 15 | # GNU General Public License for more details. |
16 | # | 16 | # |
17 | # You should have received a copy of the GNU General Public License | 17 | # You should have received a copy of the GNU General Public License |
18 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | 18 | # along with this program. If not, see <http://www.gnu.org/licenses/>. |
19 | 19 | ||
20 | # General module for many purposes related to Information Retrieval. | 20 | # General module for many purposes related to Information Retrieval. |
21 | module RIR | 21 | module RIR |
22 | 22 | ||
23 | # A Document is a bag of words and is constructed from a string. | 23 | # A Document is a bag of words and is constructed from a string. |
24 | class Document | 24 | class Document |
25 | attr_reader :words, :doc_content | 25 | attr_reader :words, :doc_content |
26 | 26 | ||
27 | # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html | 27 | # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html |
28 | # and the \\W special escape). | 28 | # and the \\W special escape). |
29 | # | 29 | # |
30 | # Protected function, only meant to be called at initialization. | 30 | # Protected function, only meant to be called at initialization. |
31 | def format_words | 31 | def format_words |
32 | wo = [] | 32 | wo = [] |
33 | 33 | ||
34 | @doc_content.split.each do |w| | 34 | @doc_content.split.each do |w| |
35 | w.split(/\W/).each do |sw| | 35 | w.split(/\W/).each do |sw| |
36 | wo.push(sw) if sw =~ /[a-zA-Z]/ | 36 | wo.push(sw) if sw =~ /[a-zA-Z]/ |
37 | end | 37 | end |
38 | end | 38 | end |
39 | 39 | ||
40 | wo | 40 | wo |
41 | end | 41 | end |
42 | 42 | ||
43 | # Returns an Array containing the +n+-grams (words) from the current Document. | 43 | # Returns an Array containing the +n+-grams (words) from the current Document. |
44 | # | 44 | # |
45 | # ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...] | 45 | # ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...] |
46 | def ngrams(n) | 46 | def ngrams(n) |
47 | window = [] | 47 | window = [] |
48 | ngrams_array = [] | 48 | ngrams_array = [] |
49 | 49 | ||
50 | @words.each do |w| | 50 | @words.each do |w| |
51 | window.push(w) | 51 | window.push(w) |
52 | if window.size == n | 52 | if window.size == n |
53 | ngrams_array.push window.join(" ") | 53 | ngrams_array.push window.join(" ") |
54 | window.delete_at(0) | 54 | window.delete_at(0) |
55 | end | 55 | end |
56 | end | 56 | end |
57 | 57 | ||
58 | ngrams_array.uniq | 58 | ngrams_array.uniq |
59 | end | 59 | end |
60 | 60 | ||
61 | # Returns a Hash containing the words and their associated counts in the current Document. | 61 | # Returns a Hash containing the words and their associated counts in the current Document. |
62 | # | 62 | # |
63 | # count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... } | 63 | # count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... } |
64 | def count_words | 64 | def count_words |
65 | counts = Hash.new { |h,k| h[k] = 0 } | 65 | counts = Hash.new { |h,k| h[k] = 0 } |
66 | @words.each { |w| counts[w.downcase] += 1 } | 66 | @words.each { |w| counts[w.downcase] += 1 } |
67 | 67 | ||
68 | counts | 68 | counts |
69 | end | 69 | end |
70 | 70 | ||
71 | # Computes the entropy of a given string +s+ inside the document. | 71 | # Computes the entropy of a given string +s+ inside the document. |
72 | # | 72 | # |
73 | # If the string parameter is composed of many words (i.e. tokens separated | 73 | # If the string parameter is composed of many words (i.e. tokens separated |
74 | # by whitespace(s)), it is considered as an ngram. | 74 | # by whitespace(s)), it is considered as an ngram. |
75 | # | 75 | # |
76 | # entropy("guitar") #=> 0.00389919463243839 | 76 | # entropy("guitar") #=> 0.00389919463243839 |
77 | def entropy(s) | 77 | def entropy(s) |
78 | en = 0.0 | 78 | en = 0.0 |
79 | counts = self.count_words | 79 | counts = self.count_words |
80 | 80 | ||
81 | s.split.each do |w| | 81 | s.split.each do |w| |
82 | p_wi = counts[w].to_f/@words.count.to_f | 82 | p_wi = counts[w].to_f/@words.count.to_f |
83 | en += p_wi*Math.log2(p_wi) | 83 | en += p_wi*Math.log2(p_wi) |
84 | end | 84 | end |
85 | 85 | ||
86 | en *= -1 | 86 | en *= -1 |
87 | en | 87 | en |
88 | end | 88 | end |
89 | 89 | ||
90 | 90 | ||
91 | 91 | ||
92 | def initialize(content) | 92 | def initialize(content) |
93 | @doc_content = content | 93 | @doc_content = content |
94 | @words = format_words | 94 | @words = format_words |
95 | end | 95 | end |
96 | 96 | ||
97 | protected :format_words | 97 | protected :format_words |
98 | end | 98 | end |
99 | 99 | ||
100 | # A WebDocument is a Document with a +url+. | 100 | # A WebDocument is a Document with a +url+. |
101 | class WebDocument < Document | 101 | class WebDocument < Document |
102 | attr_reader :url | 102 | attr_reader :url |
103 | 103 | ||
104 | # Returns the HTML text from the page of a given +url+. | 104 | # Returns the HTML text from the page of a given +url+. |
105 | def self.get_content(url) | 105 | def self.get_content(url) |
106 | require 'net/http' | 106 | require 'net/http' |
107 | Net::HTTP.get(URI.parse(url)) | 107 | Net::HTTP.get(URI.parse(url)) |
108 | end | 108 | end |
109 | 109 | ||
110 | # WebDocument constructor, the content of the Document is the HTML page | 110 | # WebDocument constructor, the content of the Document is the HTML page |
111 | # without the tags. | 111 | # without the tags. |
112 | def initialize(url) | 112 | def initialize(url) |
113 | @url = url | 113 | @url = url |
114 | super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags | 114 | super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags |
115 | end | 115 | end |
116 | end | 116 | end |
117 | 117 | ||
118 | # A WikipediaPage is a WebDocument. | 118 | # A WikipediaPage is a WebDocument. |
119 | class WikipediaPage < WebDocument | 119 | class WikipediaPage < WebDocument |
120 | require 'rexml/document' | ||
121 | require 'net/http' | ||
122 | require 'kconv' | ||
123 | |||
124 | |||
125 | def self.search_wikipedia_titles(name) | ||
126 | res = REXML::Document.new(Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml")).toutf8).elements['api/query/search'] | ||
127 | |||
128 | res.collect { |e| e.attributes['title'] } unless res.nil? | ||
129 | end | ||
130 | |||
131 | def self.get_url(name) | ||
132 | atts = REXML::Document.new(Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml")).toutf8).elements['api/query/pages/page'].attributes | ||
133 | |||
134 | atts['fullurl'] if atts['missing'].nil? | ||
135 | end | ||
136 | |||
137 | def self.search_homepage(name) | ||
138 | title = WikipediaPage.search_wikipedia_titles name | ||
139 | |||
140 | begin | ||
141 | WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty? | ||
142 | rescue | ||
143 | puts title[0] | ||
144 | end | ||
145 | end | ||
146 | |||
147 | # def initialize(name) | ||
148 | # title = WikipediaPage.search_wikipedia_titles name | ||
149 | # raise ArgumentError, "No page found" if title.empty? | ||
150 | # super WikipediaPage.get_url title[0] | ||
151 | # end | ||
120 | end | 152 | end |
121 | end | 153 | end |
122 | 154 |
lib/rir/query.rb
1 | #!/usr/bin/env ruby | 1 | #!/usr/bin/env ruby |
2 | 2 | ||
3 | # This file is a part of an Information Retrieval oriented Ruby library | 3 | # This file is a part of an Information Retrieval oriented Ruby library |
4 | # | 4 | # |
5 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | 5 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> |
6 | # | 6 | # |
7 | # This program is free software: you can redistribute it and/or modify | 7 | # This program is free software: you can redistribute it and/or modify |
8 | # it under the terms of the GNU General Public License as published by | 8 | # it under the terms of the GNU General Public License as published by |
9 | # the Free Software Foundation, either version 3 of the License, or | 9 | # the Free Software Foundation, either version 3 of the License, or |
10 | # (at your option) any later version. | 10 | # (at your option) any later version. |
11 | # | 11 | # |
12 | # This program is distributed in the hope that it will be useful, | 12 | # This program is distributed in the hope that it will be useful, |
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | # GNU General Public License for more details. | 15 | # GNU General Public License for more details. |
16 | # | 16 | # |
17 | # You should have received a copy of the GNU General Public License | 17 | # You should have received a copy of the GNU General Public License |
18 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | 18 | # along with this program. If not, see <http://www.gnu.org/licenses/>. |
19 | 19 | ||
20 | # General module for many purposes related to Information Retrieval. | 20 | # General module for many purposes related to Information Retrieval. |
21 | module RIR | 21 | module RIR |
22 | 22 | ||
23 | class Query | 23 | class Query |
24 | end | 24 | end |
25 | 25 | ||
26 | module Indri | 26 | module Indri |
27 | 27 | ||
28 | class Parameters | 28 | class Parameters |
29 | attr_accessor :corpus, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline | 29 | attr_accessor :corpus, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline |
30 | 30 | ||
31 | def initialize(corpus,mem="1g",count="1000",offset="1",run_id="default",print_query=false,print_docs=false) | 31 | def initialize(corpus,mem="1g",count="1000",offset="1",run_id="default",print_query=false,print_docs=false) |
32 | @corpus = corpus | 32 | @corpus = corpus |
33 | @memory = mem | 33 | @memory = mem |
34 | @count = count | 34 | @count = count |
35 | @offset = offset | 35 | @offset = offset |
36 | @run_id = run_id | 36 | @run_id = run_id |
37 | @print_query = print_query ? "true" : "false" | 37 | @print_query = print_query ? "true" : "false" |
38 | @print_docs = print_docs ? "true" : "false" | 38 | @print_docs = print_docs ? "true" : "false" |
39 | end | 39 | end |
40 | 40 | ||
41 | def to_s | 41 | def to_s |
42 | h = "<parameters>\n" | 42 | h = "<parameters>\n" |
43 | h += "<memory>#{@memory}</memory>\n" | 43 | h += "<memory>#{@memory}</memory>\n" |
44 | h += "<index>#{@corpus}</index>\n" | 44 | h += "<index>#{@corpus}</index>\n" |
45 | h += "<count>#{@count}</count>\n" | 45 | h += "<count>#{@count}</count>\n" |
46 | unless @baseline.nil? | 46 | unless @baseline.nil? |
47 | h += "<baseline>#{@baseline}</baseline>\n" | 47 | h += "<baseline>#{@baseline}</baseline>\n" |
48 | else | 48 | else |
49 | h += "<rule>#{@rule}</rule>\n" | 49 | h += "<rule>#{@rule}</rule>\n" |
50 | end | 50 | end |
51 | h += "<queryOffset>#{@offset}</queryOffset>\n" | 51 | h += "<queryOffset>#{@offset}</queryOffset>\n" |
52 | h += "<runID>#{@run_id}</runID>\n" | 52 | h += "<runID>#{@run_id}</runID>\n" |
53 | h += "<printQuery>#{@print_query}</printQuery>\n" | 53 | h += "<printQuery>#{@print_query}</printQuery>\n" |
54 | h += "<printDocuments>#{@print_docs}</printDocuments>\n" | 54 | h += "<printDocuments>#{@print_docs}</printDocuments>\n" |
55 | 55 | ||
56 | h | 56 | h |
57 | end | 57 | end |
58 | end | 58 | end |
59 | 59 | ||
60 | class IndriQuery < Query | 60 | class IndriQuery < Query |
61 | attr_accessor :id, :query, :params, :rule | 61 | attr_accessor :id, :query, :params, :rule |
62 | 62 | ||
63 | def initialize(id,query,params) | 63 | def initialize(id,query,params) |
64 | # @params = Parameters === params ? params : Parameters.new(corpus) | ||
65 | @params = params | 64 | @params = params |
66 | # Here we set the default retrieval model as Language Modeling | 65 | # Here we set the default retrieval model as Language Modeling |
67 | # with a Dirichlet smoothing at 2500. | 66 | # with a Dirichlet smoothing at 2500. |
68 | # TODO: maybe a Rule class... | 67 | # TODO: maybe a Rule class... |
69 | @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil? | 68 | @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil? |
70 | 69 | ||
71 | @id = id | 70 | @id = id |
72 | @query = query | 71 | @query = query |
73 | end | 72 | end |
74 | 73 | ||
75 | def to_s | 74 | def to_s |
76 | h = @params.to_s | 75 | h = @params.to_s |
77 | h += "<query>\n" | 76 | h += "<query>\n" |
78 | h += "<number>#{@id}</number>\n" | 77 | h += "<number>#{@id}</number>\n" |
79 | h += "<text>#{@query}</text>\n" | 78 | h += "<text>#{@query}</text>\n" |
80 | h += "</query>\n" | 79 | h += "</query>\n" |
81 | h += "</parameters>" | 80 | h += "</parameters>" |
82 | 81 | ||
83 | h | 82 | h |
84 | end | 83 | end |
85 | end | 84 | end |
86 | 85 | ||
87 | end | 86 | end |
88 | end | 87 | end |
89 | 88 |
lib/rir/regexp.rb
File was created | 1 | #!/usr/bin/env ruby | |
2 | |||
3 | # This file is a part of an Information Retrieval oriented Ruby library | ||
4 | # | ||
5 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | ||
6 | # | ||
7 | # This program is free software: you can redistribute it and/or modify | ||
8 | # it under the terms of the GNU General Public License as published by | ||
9 | # the Free Software Foundation, either version 3 of the License, or | ||
10 | # (at your option) any later version. | ||
11 | # | ||
12 | # This program is distributed in the hope that it will be useful, | ||
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
15 | # GNU General Public License for more details. | ||
16 | # | ||
17 | # You should have received a copy of the GNU General Public License | ||
18 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
19 | |||
20 | class Regexp | ||
21 | |||
22 | def negated | ||
23 | /^((?!#{self}).)*$/ | ||
24 | end | ||
25 | |||
26 | end | ||
27 |
lib/rir/string.rb
1 | #!/usr/bin/env ruby | 1 | #!/usr/bin/env ruby |
2 | 2 | ||
3 | # This file is a part of an Information Retrieval oriented Ruby library | 3 | # This file is a part of an Information Retrieval oriented Ruby library |
4 | # | 4 | # |
5 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | 5 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> |
6 | # | 6 | # |
7 | # This program is free software: you can redistribute it and/or modify | 7 | # This program is free software: you can redistribute it and/or modify |
8 | # it under the terms of the GNU General Public License as published by | 8 | # it under the terms of the GNU General Public License as published by |
9 | # the Free Software Foundation, either version 3 of the License, or | 9 | # the Free Software Foundation, either version 3 of the License, or |
10 | # (at your option) any later version. | 10 | # (at your option) any later version. |
11 | # | 11 | # |
12 | # This program is distributed in the hope that it will be useful, | 12 | # This program is distributed in the hope that it will be useful, |
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | # GNU General Public License for more details. | 15 | # GNU General Public License for more details. |
16 | # | 16 | # |
17 | # You should have received a copy of the GNU General Public License | 17 | # You should have received a copy of the GNU General Public License |
18 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | 18 | # along with this program. If not, see <http://www.gnu.org/licenses/>. |
19 | 19 | ||
20 | # General module for many purposes related to Information Retrieval. | 20 | # General module for many purposes related to Information Retrieval. |
21 | module RIR | 21 | module RIR |
22 | 22 | ||
23 | # These are the default stopwords provided by Lemur. | 23 | # These are the default stopwords provided by Lemur. |
24 | Stoplist = [ | 24 | Stoplist = [ |
25 | "a", "anything", "anyway", "anywhere", "apart", "are", "around", "as", "at", "av", | 25 | "a", "anything", "anyway", "anywhere", "apart", "are", "around", "as", "at", "av", |
26 | "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", | 26 | "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", |
27 | "behind", "being", "below", "beside", "besides", "between", "beyond", "both", "but", "by", | 27 | "behind", "being", "below", "beside", "besides", "between", "beyond", "both", "but", "by", |
28 | "can", "cannot", "canst", "certain", "cf", "choose", "contrariwise", "cos", "could", "cu", | 28 | "can", "cannot", "canst", "certain", "cf", "choose", "contrariwise", "cos", "could", "cu", |
29 | "day", "do", "does", "doesn't", "doing", "dost", "doth", "double", "down", "dual", "during", | 29 | "day", "do", "does", "doesn't", "doing", "dost", "doth", "double", "down", "dual", "during", |
30 | "each", "either", "else", "elsewhere", "enough", "et", "etc", "even", "ever", "every", | 30 | "each", "either", "else", "elsewhere", "enough", "et", "etc", "even", "ever", "every", |
31 | "everybody", "everyone", "everything", "everywhere", "except", "excepted", "excepting", | 31 | "everybody", "everyone", "everything", "everywhere", "except", "excepted", "excepting", |
32 | "exception", "exclude", "excluding", "exclusive", "far", "farther", "farthest", "few", "ff", | 32 | "exception", "exclude", "excluding", "exclusive", "far", "farther", "farthest", "few", "ff", |
33 | "first", "for", "formerly", "forth", "forward", "from", "front", "further", "furthermore", | 33 | "first", "for", "formerly", "forth", "forward", "from", "front", "further", "furthermore", |
34 | "furthest", "get", "go", "had", "halves", "hardly", "has", "hast", "hath", "have", "he", | 34 | "furthest", "get", "go", "had", "halves", "hardly", "has", "hast", "hath", "have", "he", |
35 | "hence", "henceforth", "her", "here", "hereabouts", "hereafter", "hereby", "herein", "hereto", | 35 | "hence", "henceforth", "her", "here", "hereabouts", "hereafter", "hereby", "herein", "hereto", |
36 | "hereupon", "hers", "herself", "him", "himself", "hindmost", "his", "hither", "hitherto", | 36 | "hereupon", "hers", "herself", "him", "himself", "hindmost", "his", "hither", "hitherto", |
37 | "how", "however", "howsoever", "i", "ie", "if", "in", "inasmuch", "inc", "include", | 37 | "how", "however", "howsoever", "i", "ie", "if", "in", "inasmuch", "inc", "include", |
38 | "included", "including", "indeed", "indoors", "inside", "insomuch", "instead", "into", | 38 | "included", "including", "indeed", "indoors", "inside", "insomuch", "instead", "into", |
39 | "inward", "inwards", "is", "it", "its", "itself", "just", "kind", "kg", "km", "last", | 39 | "inward", "inwards", "is", "it", "its", "itself", "just", "kind", "kg", "km", "last", |
40 | "latter", "latterly", "less", "lest", "let", "like", "little", "ltd", "many", "may", "maybe", | 40 | "latter", "latterly", "less", "lest", "let", "like", "little", "ltd", "many", "may", "maybe", |
41 | "me", "meantime", "meanwhile", "might", "moreover", "most", "mostly", "more", "mr", "mrs", | 41 | "me", "meantime", "meanwhile", "might", "moreover", "most", "mostly", "more", "mr", "mrs", |
42 | "ms", "much", "must", "my", "myself", "namely", "need", "neither", "never", "nevertheless", | 42 | "ms", "much", "must", "my", "myself", "namely", "need", "neither", "never", "nevertheless", |
43 | "next", "no", "nobody", "none", "nonetheless", "noone", "nope", "nor", "not", "nothing", | 43 | "next", "no", "nobody", "none", "nonetheless", "noone", "nope", "nor", "not", "nothing", |
44 | "notwithstanding", "now", "nowadays", "nowhere", "of", "off", "often", "ok", "on", "once", | 44 | "notwithstanding", "now", "nowadays", "nowhere", "of", "off", "often", "ok", "on", "once", |
45 | "one", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", | 45 | "one", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", |
46 | "ourselves", "out", "outside", "over", "own", "per", "perhaps", "plenty", "provide", "quite", | 46 | "ourselves", "out", "outside", "over", "own", "per", "perhaps", "plenty", "provide", "quite", |
47 | "rather", "really", "round", "said", "sake", "same", "sang", "save", "saw", "see", "seeing", | 47 | "rather", "really", "round", "said", "sake", "same", "sang", "save", "saw", "see", "seeing", |
48 | "seem", "seemed", "seeming", "seems", "seen", "seldom", "selves", "sent", "several", "shalt", | 48 | "seem", "seemed", "seeming", "seems", "seen", "seldom", "selves", "sent", "several", "shalt", |
49 | "she", "should", "shown", "sideways", "since", "slept", "slew", "slung", "slunk", "smote", | 49 | "she", "should", "shown", "sideways", "since", "slept", "slew", "slung", "slunk", "smote", |
50 | "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", | 50 | "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", |
51 | "somewhat", "somewhere", "spake", "spat", "spoke", "spoken", "sprang", "sprung", "stave", | 51 | "somewhat", "somewhere", "spake", "spat", "spoke", "spoken", "sprang", "sprung", "stave", |
52 | "staves", "still", "such", "supposing", "than", "that", "the", "thee", "their", "them", | 52 | "staves", "still", "such", "supposing", "than", "that", "the", "thee", "their", "them", |
53 | "themselves", "then", "thence", "thenceforth", "there", "thereabout", "thereabouts", | 53 | "themselves", "then", "thence", "thenceforth", "there", "thereabout", "thereabouts", |
54 | "thereafter", "thereby", "therefore", "therein", "thereof", "thereon", "thereto", "thereupon", | 54 | "thereafter", "thereby", "therefore", "therein", "thereof", "thereon", "thereto", "thereupon", |
55 | "these", "they", "this", "those", "thou", "though", "thrice", "through", "throughout", "thru", | 55 | "these", "they", "this", "those", "thou", "though", "thrice", "through", "throughout", "thru", |
56 | "thus", "thy", "thyself", "till", "to", "together", "too", "toward", "towards", "ugh", | 56 | "thus", "thy", "thyself", "till", "to", "together", "too", "toward", "towards", "ugh", |
57 | "unable", "under", "underneath", "unless", "unlike", "until", "up", "upon", "upward", | 57 | "unable", "under", "underneath", "unless", "unlike", "until", "up", "upon", "upward", |
58 | "upwards", "us", "use", "used", "using", "very", "via", "vs", "want", "was", "we", "week", | 58 | "upwards", "us", "use", "used", "using", "very", "via", "vs", "want", "was", "we", "week", |
59 | "well", "were", "what", "whatever", "whatsoever", "when", "whence", "whenever", "whensoever", | 59 | "well", "were", "what", "whatever", "whatsoever", "when", "whence", "whenever", "whensoever", |
60 | "where", "whereabouts", "whereafter", "whereas", "whereat", "whereby", "wherefore", | 60 | "where", "whereabouts", "whereafter", "whereas", "whereat", "whereby", "wherefore", |
61 | "wherefrom", "wherein", "whereinto", "whereof", "whereon", "wheresoever", "whereto", | 61 | "wherefrom", "wherein", "whereinto", "whereof", "whereon", "wheresoever", "whereto", |
62 | "whereunto", "whereupon", "wherever", "wherewith", "whether", "whew", "which", "whichever", | 62 | "whereunto", "whereupon", "wherever", "wherewith", "whether", "whew", "which", "whichever", |
63 | "whichsoever", "while", "whilst", "whither", "who", "whoa", "whoever", "whole", "whom", | 63 | "whichsoever", "while", "whilst", "whither", "who", "whoa", "whoever", "whole", "whom", |
64 | "whomever", "whomsoever", "whose", "whosoever", "why", "will", "wilt", "with", "within", | 64 | "whomever", "whomsoever", "whose", "whosoever", "why", "will", "wilt", "with", "within", |
65 | "without", "worse", "worst", "would", "wow", "ye", "yet", "year", "yippee", "you", "your", | 65 | "without", "worse", "worst", "would", "wow", "ye", "yet", "year", "yippee", "you", "your", |
66 | "yours", "yourself", "yourselves" | 66 | "yours", "yourself", "yourselves" |
67 | ] | 67 | ] |
68 | 68 | ||
69 | 69 | ||
70 | end | 70 | end |
71 | 71 | ||
72 | # Extension of the standard class String with useful functions. | 72 | # Extension of the standard class String with useful functions. |
73 | class String | 73 | class String |
74 | include RIR | 74 | include RIR |
75 | 75 | ||
76 | # Returns +true+ if +self+ belongs to RIR::Stoplist, +false+ otherwise. | 76 | # Returns +true+ if +self+ belongs to RIR::Stoplist, +false+ otherwise. |
77 | def is_stopword? | 77 | def is_stopword? |
78 | Stoplist.include?(self.downcase) | 78 | Stoplist.include?(self.downcase) |
79 | end | 79 | end |
80 | 80 | ||
81 | # Do not use. | 81 | # Do not use. |
82 | # TODO: revamp. Find out why this function is here. | 82 | # TODO: revamp. Find out why this function is here. |
83 | def remove_special_characters | 83 | def remove_special_characters |
84 | self.split.collect { |w| w.gsub(/\W/,' ').split.collect { |w| w.gsub(/\W/,' ').strip.sub(/\A.\z/, '')}.join(' ').strip.sub(/\A.\z/, '')}.join(' ') | 84 | self.split.collect { |w| w.gsub(/\W/,' ').split.collect { |w| w.gsub(/\W/,' ').strip.sub(/\A.\z/, '')}.join(' ').strip.sub(/\A.\z/, '')}.join(' ') |
85 | end | 85 | end |
86 | 86 | ||
87 | # Removes all XML-like tags from +self+. | 87 | # Removes all XML-like tags from +self+. |
88 | # | 88 | # |
89 | # s = "<html><body>test</body></html>" | 89 | # s = "<html><body>test</body></html>" |
90 | # s.strip_xml_tags! | 90 | # s.strip_xml_tags! |
91 | # s #=> "test" | 91 | # s #=> "test" |
92 | def strip_xml_tags! | 92 | def strip_xml_tags! |
93 | replace strip_with_pattern /<\/?[^>]*>/ | 93 | replace strip_with_pattern /<\/?[^>]*>/ |
94 | end | 94 | end |
95 | 95 | ||
96 | # Returns a copy of +self+ with all XML-like tags removed. | 96 | # Returns a copy of +self+ with all XML-like tags removed. |
97 | # | 97 | # |
98 | # s = "<html><body>test</body></html>" | 98 | # s = "<html><body>test</body></html>" |
99 | # s.strip_xml_tags #=> "test" | 99 | # s.strip_xml_tags #=> "test" |
100 | # s #=> "<html><body>test</body></html>" | 100 | # s #=> "<html><body>test</body></html>" |
101 | def strip_xml_tags | 101 | def strip_xml_tags |
102 | dup.strip_xml_tags! | 102 | dup.strip_xml_tags! |
103 | end | 103 | end |
104 | 104 | ||
105 | # Removes all Javascript sources from +self+. | 105 | # Removes all Javascript sources from +self+. |
106 | # | 106 | # |
107 | # s = "<script type='text/javascript'> | 107 | # s = "<script type='text/javascript'> |
108 | # var skin='vector', | 108 | # var skin='vector', |
109 | # stylepath='http://bits.wikimedia.org/skins-1.5' | 109 | # stylepath='http://bits.wikimedia.org/skins-1.5' |
110 | # </script> | 110 | # </script> |
111 | # | 111 | # |
112 | # test" | 112 | # test" |
113 | # s.strip_javascripts! | 113 | # s.strip_javascripts! |
114 | # s #=> "test" | 114 | # s #=> "test" |
115 | def strip_javascripts! | 115 | def strip_javascripts! |
116 | replace strip_with_pattern /<script type="text\/javascript">(.+?)<\/script>/m | 116 | replace strip_with_pattern /<script type="text\/javascript">(.+?)<\/script>/m |
117 | end | 117 | end |
118 | 118 | ||
119 | # Removes all Javascript sources from +self+. | 119 | # Removes all Javascript sources from +self+. |
120 | # | 120 | # |
121 | # s = "<script type='text/javascript'> | 121 | # s = "<script type='text/javascript'> |
122 | # var skin='vector', | 122 | # var skin='vector', |
123 | # stylepath='http://bits.wikimedia.org/skins-1.5' | 123 | # stylepath='http://bits.wikimedia.org/skins-1.5' |
124 | # </script> | 124 | # </script> |
125 | # | 125 | # |
126 | # test" | 126 | # test" |
127 | # s.strip_javascripts #=> "test" | 127 | # s.strip_javascripts #=> "test" |
128 | def strip_javascripts | 128 | def strip_javascripts |
129 | dup.strip_javascripts! | 129 | dup.strip_javascripts! |
130 | end | 130 | end |
131 | 131 | ||
132 | def strip_stylesheets! | 132 | def strip_stylesheets! |
133 | # TODO: revamp. dunno what it is. | 133 | # TODO: revamp. dunno what it is. |
134 | replace strip_with_pattern /<style type="text\/css">(.+?)<\/style>/m | 134 | replace strip_with_pattern /<style type="text\/css">(.+?)<\/style>/m |
135 | end | 135 | end |
136 | 136 | ||
137 | def strip_stylesheets | 137 | def strip_stylesheets |
138 | dup.strip_stylesheets! | 138 | dup.strip_stylesheets! |
139 | end | 139 | end |
140 | 140 | ||
141 | # Removes punctuation from +self+. | ||
142 | # | ||
143 | # s = "hello, world. how are you?!" | ||
144 | # s.strip_punctuation! | ||
145 | # s # => "hello world how are you" | ||
146 | def strip_punctuation! | ||
147 | replace strip_with_pattern /[^a-zA-Z0-9\-\s]/ | ||
148 | end | ||
149 | |||
150 | # Removes punctuation from +self+. | ||
151 | # | ||
152 | # s = "hello, world. how are you?!" | ||
153 | # s.strip_punctuation # => "hello world how are you" | ||
154 | def strip_punctuation | ||
155 | dup.strip_punctuation! | ||
156 | end | ||
157 | |||
141 | # Returns the text values inside all occurrences of an XML tag in +self+ | 158 | # Returns the text values inside all occurrences of an XML tag in +self+ |
142 | # | 159 | # |
143 | # s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre" | 160 | # s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre" |
144 | # s.extract_xmltags_values 'a' #=> ["Indianapolis", "Indiana"] | 161 | # s.extract_xmltags_values 'a' #=> ["Indianapolis", "Indiana"] |
145 | def extract_xmltags_values(tag_name) | 162 | def extract_xmltags_values(tag_name) |
146 | self.scan(/<#{tag_name}.*?>(.+?)<\/#{tag_name}>/).flatten | 163 | self.scan(/<#{tag_name}.*?>(.+?)<\/#{tag_name}>/).flatten |
147 | end | 164 | end |
148 | 165 | ||
149 | def strip_with_pattern(pattern) | 166 | def strip_with_pattern(pattern) |
150 | require 'cgi' | 167 | require 'cgi' |
151 | require 'kconv' | 168 | require 'kconv' |
152 | CGI::unescapeHTML(self.gsub(pattern,"")).toutf8 | 169 | CGI::unescapeHTML(self.gsub(pattern,"")).toutf8 |
153 | end | 170 | end |
154 | 171 | ||
155 | private :strip_with_pattern | 172 | private :strip_with_pattern |
156 | end | 173 | end |
157 | 174 |
main.rb
1 | $LOAD_PATH.unshift File.expand_path(File.join(File.dirname(__FILE__), "lib")) | 1 | $LOAD_PATH.unshift File.expand_path(File.join(File.dirname(__FILE__), "lib")) |
2 | 2 | ||
3 | require 'rir' | 3 | require 'rir' |
4 | 4 | ||
5 | w = RIR::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan") | 5 | w = RIR::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan") |
6 | p w.entropy("guitar") | 6 | p w.entropy("guitar") |
7 | 7 | ||
8 | params = RIR::Indri::Parameters.new("path_vers_mon_index") | 8 | params = RIR::Indri::Parameters.new("path_vers_mon_index") |
9 | p params.rule | ||
10 | q = RIR::Indri::IndriQuery.new("pouet", "bla", params) | 9 | q = RIR::Indri::IndriQuery.new("pouet", "bla", params) |
11 | puts q | 10 | puts q |
11 | |||
12 | c = RIR::Corpus.new "/home/romain/INEX/BookTrack/corpus/" | ||
13 | puts c.files.size |
test/string_test.rb
1 | #!/usr/bin/env ruby | 1 | #!/usr/bin/env ruby |
2 | 2 | ||
3 | require 'test/unit' | 3 | require 'test/unit' |
4 | 4 | ||
5 | require 'string' | 5 | require 'string' |
6 | 6 | ||
7 | class TestString < Test::Unit::TestCase | 7 | class TestString < Test::Unit::TestCase |
8 | 8 | ||
9 | def test_extract_xml | 9 | def test_extract_xml |
10 | s = "four-piece in <a>Indianapolis</a>, <a>Indiana</a> at the Murat Theatre" | 10 | s = "four-piece in <a>Indianapolis</a>, <a>Indiana</a> at the Murat Theatre" |
11 | assert_equal(["Indianapolis", "Indiana"],s.extract_xmltags_values('a')) | 11 | assert_equal(["Indianapolis", "Indiana"],s.extract_xmltags_values('a')) |
12 | end | 12 | end |
13 | 13 | ||
14 | def test_stopword | 14 | def test_stopword |
15 | assert_equal(true, "is".is_stopword?) | 15 | assert_equal(true, "is".is_stopword?) |
16 | assert_equal(true, "seen".is_stopword?) | 16 | assert_equal(true, "seen".is_stopword?) |
17 | assert_equal(false, "totally".is_stopword?) | 17 | assert_equal(false, "totally".is_stopword?) |
18 | assert_equal(false, "Paris".is_stopword?) | 18 | assert_equal(false, "Paris".is_stopword?) |
19 | end | 19 | end |
20 | 20 | ||
21 | def test_strip_xml | 21 | def test_strip_xml |
22 | assert_equal("testme", "<test>testme</test>".strip_xml_tags) | 22 | assert_equal("testme", "<test>testme</test>".strip_xml_tags) |
23 | end | 23 | end |
24 | |||
25 | def test_strip_punctuation | ||
26 | assert_equal("test test test test test test", "test, test. .test, ;test !! ? test ...test./".strip_punctuation) | ||
27 | end | ||
24 | end | 28 | end |
25 | 29 |