Commit 145387519e2023db9ed69bc07a52f1b71b6445fe
1 parent
fd4cb285a4
Exists in
master
new stuff with wikipedia
Showing 8 changed files with 92 additions and 4 deletions Side-by-side Diff
lib/rir.rb
lib/rir/corpus.rb
... | ... | @@ -24,11 +24,17 @@ |
24 | 24 | attr_accessor :path |
25 | 25 | |
26 | 26 | def initialize(path) |
27 | - @path = path | |
27 | + @path = path.chomp "/" | |
28 | 28 | end |
29 | 29 | |
30 | + # Recursively returns the paths of all files in +self.path+. | |
31 | + # WARNING: this method may take a long time if there are many | |
32 | + # files in subdirectories. | |
33 | + # | |
34 | + # c = Corpus.new "my/path" | |
35 | + # c.files # => ["my/path/README.txt", "my/path/lib/code.rb"] | |
30 | 36 | def files |
31 | - Dir.glob("**/*.*") | |
37 | + Dir["#{@path}/**/*.*"] | |
32 | 38 | end |
33 | 39 | end |
34 | 40 |
lib/rir/document.rb
... | ... | @@ -117,6 +117,38 @@ |
117 | 117 | |
118 | 118 | # A WikipediaPage is a WebDocument. |
119 | 119 | class WikipediaPage < WebDocument |
120 | + require 'rexml/document' | |
121 | + require 'net/http' | |
122 | + require 'kconv' | |
123 | + | |
124 | + | |
125 | + def self.search_wikipedia_titles(name) | |
126 | + res = REXML::Document.new(Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml")).toutf8).elements['api/query/search'] | |
127 | + | |
128 | + res.collect { |e| e.attributes['title'] } unless res.nil? | |
129 | + end | |
130 | + | |
131 | + def self.get_url(name) | |
132 | + atts = REXML::Document.new(Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml")).toutf8).elements['api/query/pages/page'].attributes | |
133 | + | |
134 | + atts['fullurl'] if atts['missing'].nil? | |
135 | + end | |
136 | + | |
137 | + def self.search_homepage(name) | |
138 | + title = WikipediaPage.search_wikipedia_titles name | |
139 | + | |
140 | + begin | |
141 | + WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty? | |
142 | + rescue | |
143 | + puts title[0] | |
144 | + end | |
145 | + end | |
146 | + | |
147 | +# def initialize(name) | |
148 | +# title = WikipediaPage.search_wikipedia_titles name | |
149 | +# raise ArgumentError, "No page found" if title.empty? | |
150 | +# super WikipediaPage.get_url title[0] | |
151 | +# end | |
120 | 152 | end |
121 | 153 | end |
lib/rir/query.rb
... | ... | @@ -61,7 +61,6 @@ |
61 | 61 | attr_accessor :id, :query, :params, :rule |
62 | 62 | |
63 | 63 | def initialize(id,query,params) |
64 | -# @params = Parameters === params ? params : Parameters.new(corpus) | |
65 | 64 | @params = params |
66 | 65 | # Here we set the default retrieval model as Language Modeling |
67 | 66 | # with a Dirichlet smoothing at 2500. |
lib/rir/regexp.rb
1 | +#!/usr/bin/env ruby | |
2 | + | |
3 | +# This file is a part of an Information Retrieval oriented Ruby library | |
4 | +# | |
5 | +# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | |
6 | +# | |
7 | +# This program is free software: you can redistribute it and/or modify | |
8 | +# it under the terms of the GNU General Public License as published by | |
9 | +# the Free Software Foundation, either version 3 of the License, or | |
10 | +# (at your option) any later version. | |
11 | +# | |
12 | +# This program is distributed in the hope that it will be useful, | |
13 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 | +# GNU General Public License for more details. | |
16 | +# | |
17 | +# You should have received a copy of the GNU General Public License | |
18 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. | |
19 | + | |
20 | +class Regexp | |
21 | + | |
22 | + def negated | |
23 | + /^((?!#{self}).)*$/ | |
24 | + end | |
25 | + | |
26 | +end |
lib/rir/string.rb
... | ... | @@ -138,6 +138,23 @@ |
138 | 138 | dup.strip_stylesheets! |
139 | 139 | end |
140 | 140 | |
141 | + # Removes punctuation from +self+. | |
142 | + # | |
143 | + # s = "hello, world. how are you?!" | |
144 | + # s.strip_punctuation! | |
145 | + # s # => "hello world how are you" | |
146 | + def strip_punctuation! | |
147 | + replace strip_with_pattern /[^a-zA-Z0-9\-\s]/ | |
148 | + end | |
149 | + | |
150 | + # Removes punctuation from +self+. | |
151 | + # | |
152 | + # s = "hello, world. how are you?!" | |
153 | + # s.strip_punctuation # => "hello world how are you" | |
154 | + def strip_punctuation | |
155 | + dup.strip_punctuation! | |
156 | + end | |
157 | + | |
141 | 158 | # Returns the text values inside all occurrences of an XML tag in +self+
142 | 159 | # |
143 | 160 | # s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre" |
main.rb
test/string_test.rb
... | ... | @@ -21,5 +21,9 @@ |
21 | 21 | def test_strip_xml |
22 | 22 | assert_equal("testme", "<test>testme</test>".strip_xml_tags) |
23 | 23 | end |
24 | + | |
25 | + def test_strip_punctuation | |
26 | + assert_equal("test test test test test test", "test, test. .test, ;test !! ? test ...test./".strip_punctuation) | |
27 | + end | |
24 | 28 | end |