new stuff with wikipedia

Romain Deveaud
1 parent fd4cb285a4
Showing 8 changed files with 92 additions and 4 deletions Side-by-side Diff
lib/rir.rb
lib/rir/corpus.rb
lib/rir/document.rb
lib/rir/query.rb
lib/rir/regexp.rb
lib/rir/string.rb
main.rb
test/string_test.rb
@@ -3,4 +3,6 @@
 require 'rir/document'
 require 'rir/string'
 require 'rir/query'
+require 'rir/corpus'
+require 'rir/regexp'
@@ -24,11 +24,17 @@
     attr_accessor :path
  
     def initialize(path)
-      @path = path
+      @path = path.chomp "/"
     end
  
+    # Recursively lists all files under +self.path+ (names matching +*.*+).
+    # WARNING: this method may take a long time if the subdirectories
+    # contain many files.
+    #
+    #   c = Corpus.new "my/path"
+    #   c.files                  # => ["my/path/README.txt", "my/path/lib/code.rb"]
     def files
-      Dir.glob("**/*.*")
+      Dir["#{@path}/**/*.*"]
     end
   end
  
@@ -117,6 +117,38 @@
  
   # A WikipediaPage is a WebDocument.
   class WikipediaPage < WebDocument
+    require 'rexml/document'
+    require 'net/http'
+    require 'kconv'
+
+
+    def self.search_wikipedia_titles(name)
+      res = REXML::Document.new(Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml")).toutf8).elements['api/query/search']
+
+      res.collect { |e| e.attributes['title'] } unless res.nil?
+    end
+
+    def self.get_url(name)
+      atts = REXML::Document.new(Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml")).toutf8).elements['api/query/pages/page'].attributes
+
+      atts['fullurl'] if atts['missing'].nil?
+    end
+
+    def self.search_homepage(name)
+      title = WikipediaPage.search_wikipedia_titles name
+
+      begin
+        WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty?
+      rescue
+        puts title[0]
+      end
+    end
+
+#    def initialize(name)
+#      title = WikipediaPage.search_wikipedia_titles name
+#      raise ArgumentError, "No page found" if title.empty? 
+#      super WikipediaPage.get_url title[0]
+#    end
   end
 end
@@ -61,7 +61,6 @@
       attr_accessor :id, :query, :params, :rule
  
       def initialize(id,query,params)
-#        @params = Parameters === params ? params : Parameters.new(corpus)
         @params = params
         # Here we set the default retrieval model as Language Modeling
         # with a Dirichlet smoothing at 2500.
+#!/usr/bin/env ruby
+
+# This file is a part of an Information Retrieval oriented Ruby library
+#
+# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+class Regexp
+
+  def negated
+    /^((?!#{self}).)*$/
+  end
+
+end
@@ -138,6 +138,23 @@
     dup.strip_stylesheets!
   end
  
+  # Removes punctuation from +self+.
+  #
+  #   s = "hello, world. how are you?!"
+  #   s.strip_punctuation!
+  #   s                                 # => "hello world how are you"
+  def strip_punctuation!
+    replace strip_with_pattern /[^a-zA-Z0-9\-\s]/
+  end
+
+  # Removes punctuation from +self+.
+  #
+  #   s = "hello, world. how are you?!"
+  #   s.strip_punctuation               # => "hello world how are you"
+  def strip_punctuation
+    dup.strip_punctuation!
+  end
+
  # Returns the text values inside all occurrences of an XML tag in +self+
   #
   #   s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre"
@@ -6,7 +6,9 @@
 p w.entropy("guitar")
  
 params = RIR::Indri::Parameters.new("path_vers_mon_index")
-p params.rule
 q = RIR::Indri::IndriQuery.new("pouet", "bla", params)
 puts q
+
+c = RIR::Corpus.new "/home/romain/INEX/BookTrack/corpus/"
+puts c.files.size
@@ -21,5 +21,9 @@
   def test_strip_xml
     assert_equal("testme", "<test>testme</test>".strip_xml_tags)
   end
+
+  def test_strip_punctuation
+    assert_equal("test test test test   test test", "test, test. .test, ;test !! ? test ...test./".strip_punctuation)
+  end
 end
...	...	@@ -3,4 +3,6 @@
3	3	require 'rir/document'
4	4	require 'rir/string'
5	5	require 'rir/query'
	6	+require 'rir/corpus'
	7	+require 'rir/regexp'
...	...	@@ -24,11 +24,17 @@
24	24	attr_accessor :path
25	25
26	26	def initialize(path)
27		- @path = path
	27	+ @path = path.chomp "/"
28	28	end
29	29
	30	+ # Recursively lists all files under +self.path+ (names matching +*.*+).
	31	+ # WARNING: this method may take a long time if the subdirectories
	32	+ # contain many files.
	33	+ #
	34	+ # c = Corpus.new "my/path"
	35	+ # c.files # => ["my/path/README.txt", "my/path/lib/code.rb"]
30	36	def files
31		- Dir.glob("**/*.*")
	37	+ Dir["#{@path}/**/*.*"]
32	38	end
33	39	end
34	40
...	...	@@ -117,6 +117,38 @@
117	117
118	118	# A WikipediaPage is a WebDocument.
119	119	class WikipediaPage < WebDocument
	120	+ require 'rexml/document'
	121	+ require 'net/http'
	122	+ require 'kconv'
	123	+
	124	+
	125	+ def self.search_wikipedia_titles(name)
	126	+ res = REXML::Document.new(Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml")).toutf8).elements['api/query/search']
	127	+
	128	+ res.collect { |e| e.attributes['title'] } unless res.nil?
	129	+ end
	130	+
	131	+ def self.get_url(name)
	132	+ atts = REXML::Document.new(Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml")).toutf8).elements['api/query/pages/page'].attributes
	133	+
	134	+ atts['fullurl'] if atts['missing'].nil?
	135	+ end
	136	+
	137	+ def self.search_homepage(name)
	138	+ title = WikipediaPage.search_wikipedia_titles name
	139	+
	140	+ begin
	141	+ WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty?
	142	+ rescue
	143	+ puts title[0]
	144	+ end
	145	+ end
	146	+
	147	+# def initialize(name)
	148	+# title = WikipediaPage.search_wikipedia_titles name
	149	+# raise ArgumentError, "No page found" if title.empty?
	150	+# super WikipediaPage.get_url title[0]
	151	+# end
120	152	end
121	153	end
...	...	@@ -61,7 +61,6 @@
61	61	attr_accessor :id, :query, :params, :rule
62	62
63	63	def initialize(id,query,params)
64		-# @params = Parameters === params ? params : Parameters.new(corpus)
65	64	@params = params
66	65	# Here we set the default retrieval model as Language Modeling
67	66	# with a Dirichlet smoothing at 2500.
	1	+#!/usr/bin/env ruby
	2	+
	3	+# This file is a part of an Information Retrieval oriented Ruby library
	4	+#
	5	+# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
	6	+#
	7	+# This program is free software: you can redistribute it and/or modify
	8	+# it under the terms of the GNU General Public License as published by
	9	+# the Free Software Foundation, either version 3 of the License, or
	10	+# (at your option) any later version.
	11	+#
	12	+# This program is distributed in the hope that it will be useful,
	13	+# but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	+# GNU General Public License for more details.
	16	+#
	17	+# You should have received a copy of the GNU General Public License
	18	+# along with this program. If not, see <http://www.gnu.org/licenses/>.
	19	+
	20	+class Regexp
	21	+
	22	+ def negated
	23	+ /^((?!#{self}).)*$/
	24	+ end
	25	+
	26	+end
...	...	@@ -138,6 +138,23 @@
138	138	dup.strip_stylesheets!
139	139	end
140	140
	141	+ # Removes punctuation from +self+.
	142	+ #
	143	+ # s = "hello, world. how are you?!"
	144	+ # s.strip_punctuation!
	145	+ # s # => "hello world how are you"
	146	+ def strip_punctuation!
	147	+ replace strip_with_pattern /[^a-zA-Z0-9\-\s]/
	148	+ end
	149	+
	150	+ # Removes punctuation from +self+.
	151	+ #
	152	+ # s = "hello, world. how are you?!"
	153	+ # s.strip_punctuation # => "hello world how are you"
	154	+ def strip_punctuation
	155	+ dup.strip_punctuation!
	156	+ end
	157	+
141	158	# Returns the text values inside all occurrences of an XML tag in +self+
142	159	#
143	160	# s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre"
...	...	@@ -6,7 +6,9 @@
6	6	p w.entropy("guitar")
7	7
8	8	params = RIR::Indri::Parameters.new("path_vers_mon_index")
9		-p params.rule
10	9	q = RIR::Indri::IndriQuery.new("pouet", "bla", params)
11	10	puts q
	11	+
	12	+c = RIR::Corpus.new "/home/romain/INEX/BookTrack/corpus/"
	13	+puts c.files.size
...	...	@@ -21,5 +21,9 @@
21	21	def test_strip_xml
22	22	assert_equal("testme", "<test>testme</test>".strip_xml_tags)
23	23	end
	24	+
	25	+ def test_strip_punctuation
	26	+ assert_equal("test test test test   test test", "test, test. .test, ;test !! ? test ...test./".strip_punctuation)
	27	+ end
24	28	end