# Patch summary. These leading lines sit ahead of the first "diff --git"
# header; the patch body below is reproduced unchanged.
#
# lib/rir.rb
#   Additionally requires 'rir/corpus' and 'rir/regexp'.
# lib/rir/corpus.rb
#   initialize stores the given path with one trailing "/" removed
#   (String#chomp "/"). files now globs "<path>/**/*.*" through Dir[]
#   instead of "**/*.*" relative to the working directory.
# lib/rir/document.rb (class WikipediaPage < WebDocument)
#   Requires rexml/document, net/http and kconv, and adds three class methods:
#   - search_wikipedia_titles(name): fetches the en.wikipedia.org api.php
#     "list=search" XML, and returns the 'title' attribute of each entry under
#     'api/query/search', or nil when that element is absent.
#   - get_url(name): fetches the "prop=info&inprop=url" XML and returns the
#     'fullurl' attribute of 'api/query/pages/page' unless that element carries
#     a 'missing' attribute (nil in that case).
#   - search_homepage(name): builds a WikipediaPage from the URL of the first
#     search hit; returns nil when there are no hits. Any StandardError raised
#     on the way is rescued and the first title is printed instead.
#   An alternative initialize(name) is left commented out.
# lib/rir/query.rb
#   Drops one commented-out "@params = ..." line; no executable change.
# lib/rir/regexp.rb (new file)
#   Reopens Regexp and adds #negated, which returns /^((?!<self>).)*$/.
# lib/rir/string.rb
#   Adds String#strip_punctuation! (replace with the result of
#   strip_with_pattern /[^a-zA-Z0-9\-\s]/) and String#strip_punctuation
#   (dup followed by the bang variant).
# main.rb
#   Removes "p params.rule"; creates a Corpus and prints the size of its files.
# test/string_test.rb
#   Adds test_strip_punctuation.
#
# NOTE(review): line breaks inside the hunks appear to have been collapsed in
#   this copy of the patch -- confirm against the original commit before applying.
# NOTE(review): URI.escape was removed in Ruby 3.0 -- confirm the targeted Ruby
#   version, or switch to another encoder in a follow-up change.
# NOTE(review): the "*.*" glob only matches names containing a dot, and Dir[]
#   returns paths that include the corpus path prefix, so the documented
#   example output (["README.txt", "lib/code.rb"]) looks inaccurate -- verify.
# NOTE(review): get_url calls .attributes on the looked-up element without a
#   nil check; search_homepage's bare rescue would hide that failure.
# NOTE(review): Regexp#negated anchors with ^ and $, which match per line
#   rather than per string -- confirm that is intended.
# NOTE(review): the GPL notice added in lib/rir/regexp.rb ends with "see ." --
#   the license URL seems to be missing (presumably
#   <http://www.gnu.org/licenses/>); confirm against the other file headers.
# NOTE(review): test_strip_punctuation expects single-spaced output; whether
#   runs of whitespace are collapsed depends on strip_with_pattern, which is
#   not part of this patch -- verify.
diff --git a/lib/rir.rb b/lib/rir.rb index 0f336e0..0b27852 100644 --- a/lib/rir.rb +++ b/lib/rir.rb @@ -3,3 +3,5 @@ require 'rir/document' require 'rir/string' require 'rir/query' +require 'rir/corpus' +require 'rir/regexp' diff --git a/lib/rir/corpus.rb b/lib/rir/corpus.rb index 44d2f3f..f443ec4 100644 --- a/lib/rir/corpus.rb +++ b/lib/rir/corpus.rb @@ -24,11 +24,17 @@ module RIR attr_accessor :path def initialize(path) - @path = path + @path = path.chomp "/" end + # Recursively outputs all files in +self.path+. + # WARNING ! This function may take a lot of time if many + # files are in subdirectories. + # + # c = Corpus.new "my/path" + # c.files # => ["README.txt", "lib/code.rb"] def files - Dir.glob("**/*.*") + Dir["#{@path}/**/*.*"] end end diff --git a/lib/rir/document.rb b/lib/rir/document.rb index 87a5c28..9bd05ae 100644 --- a/lib/rir/document.rb +++ b/lib/rir/document.rb @@ -117,5 +117,37 @@ module RIR # A WikipediaPage is a WebDocument. class WikipediaPage < WebDocument + require 'rexml/document' + require 'net/http' + require 'kconv' + + + def self.search_wikipedia_titles(name) + res = REXML::Document.new(Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml")).toutf8).elements['api/query/search'] + + res.collect { |e| e.attributes['title'] } unless res.nil? + end + + def self.get_url(name) + atts = REXML::Document.new(Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml")).toutf8).elements['api/query/pages/page'].attributes + + atts['fullurl'] if atts['missing'].nil? + end + + def self.search_homepage(name) + title = WikipediaPage.search_wikipedia_titles name + + begin + WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty? 
+ rescue + puts title[0] + end + end + +# def initialize(name) +# title = WikipediaPage.search_wikipedia_titles name +# raise ArgumentError, "No page found" if title.empty? +# super WikipediaPage.get_url title[0] +# end end end diff --git a/lib/rir/query.rb b/lib/rir/query.rb index 63ca4ca..581901e 100644 --- a/lib/rir/query.rb +++ b/lib/rir/query.rb @@ -61,7 +61,6 @@ module RIR attr_accessor :id, :query, :params, :rule def initialize(id,query,params) -# @params = Parameters === params ? params : Parameters.new(corpus) @params = params # Here we set the default retrieval model as Language Modeling # with a Dirichlet smoothing at 2500. diff --git a/lib/rir/regexp.rb b/lib/rir/regexp.rb new file mode 100644 index 0000000..dc718b9 --- /dev/null +++ b/lib/rir/regexp.rb @@ -0,0 +1,26 @@ +#!/usr/bin/env ruby + +# This file is a part of an Information Retrieval oriented Ruby library +# +# Copyright (C) 2010-2011 Romain Deveaud +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +class Regexp + + def negated + /^((?!#{self}).)*$/ + end + +end diff --git a/lib/rir/string.rb b/lib/rir/string.rb index cbf4c23..7a95f5c 100644 --- a/lib/rir/string.rb +++ b/lib/rir/string.rb @@ -138,6 +138,23 @@ class String dup.strip_stylesheets! end + # Removes punctuation from +self+. + # + # s = "hello, world. how are you?!" + # s.strip_punctuation! + # s # => "hello world how are you" + def strip_punctuation! 
+ replace strip_with_pattern /[^a-zA-Z0-9\-\s]/ + end + + # Removes punctuation from +self+. + # + # s = "hello, world. how are you?!" + # s.strip_punctuation # => "hello world how are you" + def strip_punctuation + dup.strip_punctuation! + end + # Returns the text values inside all occurences of a XML tag in +self+ # # s = "four-piece in Indianapolis, Indiana at the Murat Theatre" diff --git a/main.rb b/main.rb index 0ea2e47..f79f17c 100644 --- a/main.rb +++ b/main.rb @@ -6,6 +6,8 @@ w = RIR::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Pl p w.entropy("guitar") params = RIR::Indri::Parameters.new("path_vers_mon_index") -p params.rule q = RIR::Indri::IndriQuery.new("pouet", "bla", params) puts q + +c = RIR::Corpus.new "/home/romain/INEX/BookTrack/corpus/" +puts c.files.size diff --git a/test/string_test.rb b/test/string_test.rb index 55bbaa4..b01b9be 100644 --- a/test/string_test.rb +++ b/test/string_test.rb @@ -21,4 +21,8 @@ class TestString < Test::Unit::TestCase def test_strip_xml assert_equal("testme", "testme".strip_xml_tags) end + + def test_strip_punctuation + assert_equal("test test test test test test", "test, test. .test, ;test !! ? test ...test./".strip_punctuation) + end end