Commit 145387519e2023db9ed69bc07a52f1b71b6445fe

Authored by Romain Deveaud
1 parent fd4cb285a4
Exists in master

new stuff with wikipedia

Showing 8 changed files with 92 additions and 4 deletions Side-by-side Diff

... ... @@ -3,4 +3,6 @@
3 3 require 'rir/document'
4 4 require 'rir/string'
5 5 require 'rir/query'
  6 +require 'rir/corpus'
  7 +require 'rir/regexp'
... ... @@ -24,11 +24,17 @@
24 24 attr_accessor :path
25 25  
# Creates a corpus rooted at +path+.
#
# Every trailing slash is dropped (not just the last one) so that the
# stored path can always be joined with a sub-path without producing
# doubled separators.
#
#   Corpus.new("my/path/").path  # => "my/path"
#   Corpus.new("my/path//").path # => "my/path"
def initialize(path)
  @path = path.sub(%r{/+\z}, "")
end
29 29  
# Recursively lists every regular file located under +self.path+.
# Directories are left out, files without an extension are included, and
# entries whose name starts with a dot are not matched by the glob.
# Returned paths are prefixed with +self.path+.
# WARNING ! This function may take a lot of time if many
# files are in subdirectories.
#
#   c = Corpus.new "my/path"
#   c.files # => ["my/path/README.txt", "my/path/lib/code.rb"]
def files
  Dir.glob(File.join(@path, "**", "*")).select { |entry| File.file?(entry) }
end
33 39 end
34 40  
... ... @@ -117,6 +117,38 @@
117 117  
118 118 # A WikipediaPage is a WebDocument.
class WikipediaPage < WebDocument
  require 'rexml/document'
  require 'net/http'
  require 'uri'
  require 'kconv'

  # MediaWiki API endpoint of the English Wikipedia. HTTPS is used because
  # Net::HTTP.get does not follow the redirect served on plain HTTP.
  API_ENDPOINT = "https://en.wikipedia.org/w/api.php"

  # Runs a full-text search for +name+ on Wikipedia.
  #
  # Returns an Array with the +title+ attribute of every result element,
  # or +nil+ when the response contains no <tt>api/query/search</tt> node.
  def self.search_wikipedia_titles(name)
    res = api_query(:list => 'search', :srsearch => name).elements['api/query/search']

    # Go through +elements+ so that only child elements are visited.
    res.elements.collect { |e| e.attributes['title'] } unless res.nil?
  end

  # Looks up the page whose title is +name+.
  #
  # Returns the +fullurl+ attribute of the page, or +nil+ when the response
  # holds no page element or the page is flagged as +missing+.
  def self.get_url(name)
    page = api_query(:titles => name, :inprop => 'url', :prop => 'info').elements['api/query/pages/page']
    return nil if page.nil?

    atts = page.attributes
    atts['fullurl'] if atts['missing'].nil?
  end

  # Builds a WikipediaPage from the first search result for +name+.
  #
  # This is a best-effort lookup: it returns +nil+ when nothing is found or
  # when fetching/building the page fails. Failures are reported on standard
  # error together with the title that triggered them.
  def self.search_homepage(name)
    title = WikipediaPage.search_wikipedia_titles name
    return nil if title.nil? || title.empty?

    begin
      url = WikipediaPage.get_url title[0]
      url.nil? ? nil : WikipediaPage.new(url)
    rescue StandardError => e
      warn "#{title[0]}: #{e.class}: #{e.message}"
      nil
    end
  end

  # Sends an +action=query+ request with the extra +params+ to the API and
  # returns the parsed XML response as a REXML::Document.
  # URI.encode_www_form takes care of escaping the parameter values.
  def self.api_query(params)
    query = URI.encode_www_form(params.merge(:action => 'query', :format => 'xml'))
    REXML::Document.new(Net::HTTP.get(URI.parse("#{API_ENDPOINT}?#{query}")).toutf8)
  end
  private_class_method :api_query

#    def initialize(name)
#      title = WikipediaPage.search_wikipedia_titles name
#      raise ArgumentError, "No page found" if title.empty?
#      super WikipediaPage.get_url title[0]
#    end
end
121 153 end
... ... @@ -61,7 +61,6 @@
61 61 attr_accessor :id, :query, :params, :rule
62 62  
63 63 def initialize(id,query,params)
64   -# @params = Parameters === params ? params : Parameters.new(corpus)
65 64 @params = params
66 65 # Here we set the default retrieval model as Language Modeling
67 66 # with a Dirichlet smoothing at 2500.
  1 +#!/usr/bin/env ruby
  2 +
  3 +# This file is a part of an Information Retrieval oriented Ruby library
  4 +#
  5 +# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
  6 +#
  7 +# This program is free software: you can redistribute it and/or modify
  8 +# it under the terms of the GNU General Public License as published by
  9 +# the Free Software Foundation, either version 3 of the License, or
  10 +# (at your option) any later version.
  11 +#
  12 +# This program is distributed in the hope that it will be useful,
  13 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15 +# GNU General Public License for more details.
  16 +#
  17 +# You should have received a copy of the GNU General Public License
  18 +# along with this program. If not, see <http://www.gnu.org/licenses/>.
  19 +
class Regexp

  # Returns a new Regexp that matches a string only when +self+ matches
  # nowhere inside it.
  #
  # The result is anchored on the whole string (<tt>\A</tt> / <tt>\z</tt>)
  # and lets <tt>.</tt> cross newlines. A multi-line string is therefore
  # rejected as soon as any of its lines matches +self+.
  #
  #   /foo/.negated =~ "bar"      # => 0
  #   /foo/.negated =~ "a foo"    # => nil
  #   /foo/.negated =~ "bar\nfoo" # => nil
  def negated
    # At every position, refuse to advance if +self+ would match there.
    /\A((?!#{self}).)*\z/m
  end

end
... ... @@ -138,6 +138,23 @@
138 138 dup.strip_stylesheets!
139 139 end
140 140  
# Strips punctuation from +self+, modifying the receiver in place.
#
# The pattern handed to +strip_with_pattern+ targets every character that
# is not an ASCII letter, a digit, a hyphen or whitespace. The receiver's
# content is then replaced with whatever that helper returns.
#
#   s = "hello, world. how are you?!"
#   s.strip_punctuation!
#   s # => "hello world how are you"
def strip_punctuation!
  cleaned = strip_with_pattern(/[^a-zA-Z0-9\-\s]/)
  replace(cleaned)
end
  149 +
# Non-destructive counterpart of +strip_punctuation!+: the receiver is
# duplicated and the punctuation is stripped from the copy, which is returned.
#
#   s = "hello, world. how are you?!"
#   s.strip_punctuation # => "hello world how are you"
def strip_punctuation
  copy = dup
  copy.strip_punctuation!
end
  157 +
141 158 # Returns the text values inside all occurrences of an XML tag in +self+
142 159 #
143 160 # s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre"
... ... @@ -6,7 +6,9 @@
6 6 p w.entropy("guitar")
7 7  
8 8 params = RIR::Indri::Parameters.new("path_vers_mon_index")
9   -p params.rule
10 9 q = RIR::Indri::IndriQuery.new("pouet", "bla", params)
11 10 puts q
  11 +
  12 +c = RIR::Corpus.new "/home/romain/INEX/BookTrack/corpus/"
  13 +puts c.files.size
... ... @@ -21,5 +21,9 @@
21 21 def test_strip_xml
22 22 assert_equal("testme", "<test>testme</test>".strip_xml_tags)
23 23 end
  24 +
# Checks that strip_punctuation turns a punctuation-heavy sentence into
# its words separated by single spaces.
def test_strip_punctuation
  noisy = "test, test. .test, ;test !! ? test ...test./"
  assert_equal("test test test test test test", noisy.strip_punctuation)
end
24 28 end