Commit 145387519e2023db9ed69bc07a52f1b71b6445fe
1 parent
fd4cb285a4
Exists in
master
new stuff with wikipedia
Showing 8 changed files with 92 additions and 4 deletions Side-by-side Diff
lib/rir.rb
lib/rir/corpus.rb
| ... | ... | @@ -24,11 +24,17 @@ |
| 24 | 24 | attr_accessor :path |
| 25 | 25 | |
| 26 | 26 | def initialize(path) |
| 27 | - @path = path | |
| 27 | + @path = path.chomp "/" | |
| 28 | 28 | end |
| 29 | 29 | |
| 30 | + # Recursively lists all files under +self.path+. | |
| 31 | + # WARNING: this method may take a long time if the | |
| 32 | + # subdirectories contain many files. | |
| 33 | + # | |
| 34 | + # c = Corpus.new "my/path" | |
| 35 | + # c.files # => ["my/path/README.txt", "my/path/lib/code.rb"] | |
| 30 | 36 | def files |
| 31 | - Dir.glob("**/*.*") | |
| 37 | + Dir["#{@path}/**/*.*"] | |
| 32 | 38 | end |
| 33 | 39 | end |
| 34 | 40 |
lib/rir/document.rb
| ... | ... | @@ -117,6 +117,38 @@ |
| 117 | 117 | |
| 118 | 118 | # A WikipediaPage is a WebDocument. |
| 119 | 119 | class WikipediaPage < WebDocument |
| 120 | + require 'rexml/document' | |
| 121 | + require 'net/http' | |
| 122 | + require 'kconv' | |
| 123 | + | |
| 124 | + | |
| 125 | + def self.search_wikipedia_titles(name) | |
| 126 | + res = REXML::Document.new(Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml")).toutf8).elements['api/query/search'] | |
| 127 | + | |
| 128 | + res.collect { |e| e.attributes['title'] } unless res.nil? | |
| 129 | + end | |
| 130 | + | |
| 131 | + def self.get_url(name) | |
| 132 | + atts = REXML::Document.new(Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml")).toutf8).elements['api/query/pages/page'].attributes | |
| 133 | + | |
| 134 | + atts['fullurl'] if atts['missing'].nil? | |
| 135 | + end | |
| 136 | + | |
| 137 | + def self.search_homepage(name) | |
| 138 | + title = WikipediaPage.search_wikipedia_titles name | |
| 139 | + | |
| 140 | + begin | |
| 141 | + WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty? | |
| 142 | + rescue | |
| 143 | + puts title[0] | |
| 144 | + end | |
| 145 | + end | |
| 146 | + | |
| 147 | +# def initialize(name) | |
| 148 | +# title = WikipediaPage.search_wikipedia_titles name | |
| 149 | +# raise ArgumentError, "No page found" if title.empty? | |
| 150 | +# super WikipediaPage.get_url title[0] | |
| 151 | +# end | |
| 120 | 152 | end |
| 121 | 153 | end |
lib/rir/query.rb
| ... | ... | @@ -61,7 +61,6 @@ |
| 61 | 61 | attr_accessor :id, :query, :params, :rule |
| 62 | 62 | |
| 63 | 63 | def initialize(id,query,params) |
| 64 | -# @params = Parameters === params ? params : Parameters.new(corpus) | |
| 65 | 64 | @params = params |
| 66 | 65 | # Here we set the default retrieval model as Language Modeling |
| 67 | 66 | # with a Dirichlet smoothing at 2500. |
lib/rir/regexp.rb
| 1 | +#!/usr/bin/env ruby | |
| 2 | + | |
| 3 | +# This file is a part of an Information Retrieval oriented Ruby library | |
| 4 | +# | |
| 5 | +# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | |
| 6 | +# | |
| 7 | +# This program is free software: you can redistribute it and/or modify | |
| 8 | +# it under the terms of the GNU General Public License as published by | |
| 9 | +# the Free Software Foundation, either version 3 of the License, or | |
| 10 | +# (at your option) any later version. | |
| 11 | +# | |
| 12 | +# This program is distributed in the hope that it will be useful, | |
| 13 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 14 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 15 | +# GNU General Public License for more details. | |
| 16 | +# | |
| 17 | +# You should have received a copy of the GNU General Public License | |
| 18 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. | |
| 19 | + | |
| 20 | +class Regexp | |
| 21 | + | |
| 22 | + def negated | |
| 23 | + /^((?!#{self}).)*$/ | |
| 24 | + end | |
| 25 | + | |
| 26 | +end |
lib/rir/string.rb
| ... | ... | @@ -138,6 +138,23 @@ |
| 138 | 138 | dup.strip_stylesheets! |
| 139 | 139 | end |
| 140 | 140 | |
| 141 | + # Removes punctuation from +self+. | |
| 142 | + # | |
| 143 | + # s = "hello, world. how are you?!" | |
| 144 | + # s.strip_punctuation! | |
| 145 | + # s # => "hello world how are you" | |
| 146 | + def strip_punctuation! | |
| 147 | + replace strip_with_pattern /[^a-zA-Z0-9\-\s]/ | |
| 148 | + end | |
| 149 | + | |
| 150 | + # Returns a copy of +self+ with punctuation removed. | |
| 151 | + # | |
| 152 | + # s = "hello, world. how are you?!" | |
| 153 | + # s.strip_punctuation # => "hello world how are you" | |
| 154 | + def strip_punctuation | |
| 155 | + dup.strip_punctuation! | |
| 156 | + end | |
| 157 | + | |
| 141 | 158 | # Returns the text values inside all occurrences of an XML tag in +self+ |
| 142 | 159 | # |
| 143 | 160 | # s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre" |
main.rb
test/string_test.rb
| ... | ... | @@ -21,5 +21,9 @@ |
| 21 | 21 | def test_strip_xml |
| 22 | 22 | assert_equal("testme", "<test>testme</test>".strip_xml_tags) |
| 23 | 23 | end |
| 24 | + | |
| 25 | + def test_strip_punctuation | |
| 26 | + assert_equal("test test test test test test", "test, test. .test, ;test !! ? test ...test./".strip_punctuation) | |
| 27 | + end | |
| 24 | 28 | end |