Deveaud Romain / mirimiri

Browse Code »

Commit 145387519e2023db9ed69bc07a52f1b71b6445fe

Authored by Romain Deveaud 2010-11-22 18:36:38 +0100

1 parent fd4cb285a4

Exists in master

new stuff with wikipedia

Showing 8 changed files with 92 additions and 4 deletions Inline Diff

lib/rir.rb
lib/rir/corpus.rb
lib/rir/document.rb
lib/rir/query.rb
lib/rir/regexp.rb
lib/rir/string.rb
main.rb
test/string_test.rb

lib/rir.rb

Diff comments View file @ 1453875

1	#!/usr/bin/env ruby	1	#!/usr/bin/env ruby
2		2
3	require 'rir/document'	3	require 'rir/document'
4	require 'rir/string'	4	require 'rir/string'
5	require 'rir/query'	5	require 'rir/query'
		6	require 'rir/corpus'
		7	require 'rir/regexp'
6		8

lib/rir/corpus.rb

Diff comments View file @ 1453875

 #!/usr/bin/env ruby
 # This file is a part of an Information Retrieval oriented Ruby library
 #
 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 # General module for many purposes related to Information Retrieval.
 module RIR
   # A Corpus is a directory on disk, identified by its +path+.
   class Corpus
     attr_accessor :path
     # Stores +path+ with a single trailing "/" removed, so that +files+ can
     # append its own separator. Assigning +path+ later through the accessor
     # does not go through this normalisation.
     def initialize(path)
-      @path = path
+      @path = path.chomp "/"
     end
+    # Recursively lists the entries under +self.path+ whose name contains
+    # a dot (glob pattern "**/*.*"). Returned paths include +self.path+.
+    # WARNING! This function may take a lot of time if many
+    # files are in subdirectories.
+    #
+    #   c = Corpus.new "my/path"
+    #   c.files                  # => ["my/path/README.txt", "my/path/lib/code.rb"]
     def files
-      Dir.glob("**/*.*")
+      Dir["#{@path}/**/*.*"]
     end
   end
 end

lib/rir/document.rb

Diff comments View file @ 1453875

 #!/usr/bin/env ruby
 # This file is a part of an Information Retrieval oriented Ruby library
 #
 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 # General module for many purposes related to Information Retrieval.
 module RIR
   # A Document is a bag of words and is constructed from a string.
   class Document
     attr_reader :words, :doc_content
     # Splits the raw content on non-word characters (the \\W special escape,
     # see http://perldoc.perl.org/perlre.html) and keeps only the tokens
     # that contain at least one ASCII letter, so purely numeric tokens are
     # dropped.
     #
     # Protected function, only meant to be called at the initialization.
     def format_words
       @doc_content.split(/\W+/).grep(/[a-zA-Z]/)
     end
     # Returns an Array containing the +n+-grams (words) from the current Document.
     # Each distinct n-gram is reported once, in order of first appearance; the
     # result is empty when the document has fewer than +n+ words.
     #
     #   ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]
     def ngrams(n)
       window       = []
       ngrams_array = []
       @words.each do |w|
         window.push(w)
         next unless window.size == n
         ngrams_array.push window.join(" ")
         # Slide the window one word forward.
         window.shift
       end
       ngrams_array.uniq
     end
     # Returns a Hash containing the words and their associated counts in the current Document.
     # Words are downcased before counting, and looking up an unknown word in
     # the returned Hash yields 0.
     #
     #   count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }
     def count_words
       counts = Hash.new { |h,k| h[k] = 0 }
       @words.each { |w| counts[w.downcase] += 1 }
       counts
     end
     # Computes the entropy of a given string +s+ inside the document.
     #
     # If the string parameter is composed of many words (i.e. tokens separated
     # by whitespace(s)), it is considered as an ngram and the contributions
     # -p*log2(p) of its words are summed. Words are matched case-insensitively,
     # and a word that does not occur in the document contributes 0.
     #
     #   entropy("guitar") #=> 0.00389919463243839
     def entropy(s)
       total = @words.count.to_f
       # An empty document has no word distribution; returning early also
       # avoids the 0/0 division that would yield NaN.
       return 0.0 if total.zero?
       counts = count_words
       s.split.inject(0.0) do |en, w|
         # count_words keys are downcased, so the lookup has to be as well.
         p_wi = counts.fetch(w.downcase, 0) / total
         # By convention 0*log2(0) is 0; evaluating it literally gives NaN.
         p_wi.zero? ? en : en - p_wi * Math.log2(p_wi)
       end
     end
     # Builds the Document from the string +content+ and tokenizes it once.
     def initialize(content)
       @doc_content = content
       @words = format_words
     end
     protected :format_words
   end
   # A WebDocument is a Document with a +url+.
   class WebDocument < Document
     attr_reader :url
     # Fetches the page located at +url+ over HTTP and returns its body.
     def self.get_content(url)
       require 'net/http'
       target = URI.parse(url)
       Net::HTTP.get(target)
     end
     # Builds the WebDocument from the page at +url+: the page is downloaded,
     # then scripts, stylesheets and remaining tags are stripped before the
     # text is handed to the Document constructor.
     def initialize(url)
       @url = url
       html = WebDocument.get_content(url)
       text = html.strip_javascripts.strip_stylesheets.strip_xml_tags
       super(text)
     end
   end
   # A WikipediaPage is a WebDocument.
   class WikipediaPage < WebDocument
+    require 'rexml/document'
+    require 'net/http'
+    require 'kconv'
     # Runs a search for +name+ against the English Wikipedia API
     # (action=query, list=search) and collects the 'title' attribute of each
     # child of the api/query/search element of the XML answer.
     # Returns nil when the answer has no such element.
     # NOTE(review): URI.escape is obsolete and no longer exists in Ruby 3.0+.
+    def self.search_wikipedia_titles(name)
+      res = REXML::Document.new(Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml")).toutf8).elements['api/query/search']
+      res.collect { |e| e.attributes['title'] } unless res.nil?
+    end
     # Asks the API for the page info (prop=info, inprop=url) of the title
     # +name+ and returns its 'fullurl' attribute, or nil when the page
     # element carries a 'missing' attribute.
     # NOTE(review): an answer without an api/query/pages/page element would
     # make the +attributes+ call fail on nil -- confirm whether that can happen.
+    def self.get_url(name)
+      atts = REXML::Document.new(Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml")).toutf8).elements['api/query/pages/page'].attributes
+      atts['fullurl'] if atts['missing'].nil?
+    end
     # Builds a WikipediaPage from the first title returned by
     # +search_wikipedia_titles+ for +name+. Returns nil when the search gave
     # no title. Any StandardError raised while building the page is
     # swallowed: the title is printed and nil is returned.
+    def self.search_homepage(name)
+      title = WikipediaPage.search_wikipedia_titles name
+      begin
+        WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty?
+      rescue
+        puts title[0]
+      end
+    end
     # The constructor below is disabled; instances are built from a URL
     # through the inherited WebDocument constructor.
+#    def initialize(name)
+#      title = WikipediaPage.search_wikipedia_titles name
+#      raise ArgumentError, "No page found" if title.empty?
+#      super WikipediaPage.get_url title[0]
+#    end
   end
 end

lib/rir/query.rb

Diff comments View file @ 1453875

 #!/usr/bin/env ruby
 # This file is a part of an Information Retrieval oriented Ruby library
 #
 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 # General module for many purposes related to Information Retrieval.
 module RIR
   # Common superclass for queries. It defines no behaviour of its own yet.
   class Query
   end
   module Indri
     # Holds the settings rendered into the <parameters> section of an Indri
     # query file.
     class Parameters
       attr_accessor :corpus, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline
       # +corpus+ is rendered as the <index> element. The two print flags are
       # stored as the strings "true" / "false" according to their truthiness.
       # +rule+ and +baseline+ are left unset here.
       def initialize(corpus,mem="1g",count="1000",offset="1",run_id="default",print_query=false,print_docs=false)
         @corpus, @memory, @count = corpus, mem, count
         @offset, @run_id         = offset, run_id
         @print_query             = (!!print_query).to_s
         @print_docs              = (!!print_docs).to_s
       end
       # Renders the settings as XML, one element per line. The opening
       # <parameters> tag is emitted but not the closing one.
       def to_s
         # Exactly one of <baseline> / <rule> is written; baseline wins when set.
         model = @baseline.nil? ? "<rule>#{@rule}</rule>" : "<baseline>#{@baseline}</baseline>"
         [
           "<parameters>",
           "<memory>#{@memory}</memory>",
           "<index>#{@corpus}</index>",
           "<count>#{@count}</count>",
           model,
           "<queryOffset>#{@offset}</queryOffset>",
           "<runID>#{@run_id}</runID>",
           "<printQuery>#{@print_query}</printQuery>",
           "<printDocuments>#{@print_docs}</printDocuments>"
         ].map { |line| "#{line}\n" }.join
       end
     end
     # A single query together with the Parameters used to run it.
     class IndriQuery < Query
       # NOTE: +rule+ is exposed here but never assigned by this class; the
       # rule that is rendered lives on +params+.
       attr_accessor :id, :query, :params, :rule
       # +params+ is stored as given (no copy), so the default rule set below
       # is written onto the object passed in by the caller.
       def initialize(id,query,params)
-#        @params = Parameters === params ? params : Parameters.new(corpus)
         @params = params
         # Here we set the default retrieval model as Language Modeling
         # with a Dirichlet smoothing at 2500.
         # TODO: maybe a Rule class...
         @params.rule  = 'method:dirichlet,mu:2500' if @params.rule.nil?
         @id     = id
         @query  = query
       end
       # Renders +params+ followed by a <query> element holding the id and the
       # query text, then closes the <parameters> element opened by +params+.
       def to_s
         h = @params.to_s
         h += "<query>\n"
         h += "<number>#{@id}</number>\n"
         h += "<text>#{@query}</text>\n"
         h += "</query>\n"
         h += "</parameters>"
         h
       end
     end
   end
 end

lib/rir/regexp.rb

Diff comments View file @ 1453875

File was created	1	#!/usr/bin/env ruby
	2
	3	# This file is a part of an Information Retrieval oriented Ruby library
	4	#
	5	# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
	6	#
	7	# This program is free software: you can redistribute it and/or modify
	8	# it under the terms of the GNU General Public License as published by
	9	# the Free Software Foundation, either version 3 of the License, or
	10	# (at your option) any later version.
	11	#
	12	# This program is distributed in the hope that it will be useful,
	13	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	14	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	15	# GNU General Public License for more details.
	16	#
	17	# You should have received a copy of the GNU General Public License
	18	# along with this program. If not, see <http://www.gnu.org/licenses/>.
	19
	20	class Regexp
	21
	22	def negated
	23	/^((?!#{self}).)*$/
	24	end
	25
	26	end
	27

lib/rir/string.rb

Diff comments View file @ 1453875

 #!/usr/bin/env ruby
 # This file is a part of an Information Retrieval oriented Ruby library
 #
 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 # General module for many purposes related to Information Retrieval.
 module RIR
   # These are the default stopwords provided by Lemur.
   # Every entry is lowercase.
   Stoplist = %w[
     a anything anyway anywhere apart are around as at av
     be became because become becomes becoming been before beforehand
     behind being below beside besides between beyond both but by
     can cannot canst certain cf choose contrariwise cos could cu
     day do does doesn't doing dost doth double down dual during
     each either else elsewhere enough et etc even ever every
     everybody everyone everything everywhere except excepted excepting
     exception exclude excluding exclusive far farther farthest few ff
     first for formerly forth forward from front further furthermore
     furthest get go had halves hardly has hast hath have he
     hence henceforth her here hereabouts hereafter hereby herein hereto
     hereupon hers herself him himself hindmost his hither hitherto
     how however howsoever i ie if in inasmuch inc include
     included including indeed indoors inside insomuch instead into
     inward inwards is it its itself just kind kg km last
     latter latterly less lest let like little ltd many may maybe
     me meantime meanwhile might moreover most mostly more mr mrs
     ms much must my myself namely need neither never nevertheless
     next no nobody none nonetheless noone nope nor not nothing
     notwithstanding now nowadays nowhere of off often ok on once
     one only onto or other others otherwise ought our ours
     ourselves out outside over own per perhaps plenty provide quite
     rather really round said sake same sang save saw see seeing
     seem seemed seeming seems seen seldom selves sent several shalt
     she should shown sideways since slept slew slung slunk smote
     so some somebody somehow someone something sometime sometimes
     somewhat somewhere spake spat spoke spoken sprang sprung stave
     staves still such supposing than that the thee their them
     themselves then thence thenceforth there thereabout thereabouts
     thereafter thereby therefore therein thereof thereon thereto thereupon
     these they this those thou though thrice through throughout thru
     thus thy thyself till to together too toward towards ugh
     unable under underneath unless unlike until up upon upward
     upwards us use used using very via vs want was we week
     well were what whatever whatsoever when whence whenever whensoever
     where whereabouts whereafter whereas whereat whereby wherefore
     wherefrom wherein whereinto whereof whereon wheresoever whereto
     whereunto whereupon wherever wherewith whether whew which whichever
     whichsoever while whilst whither who whoa whoever whole whom
     whomever whomsoever whose whosoever why will wilt with within
     without worse worst would wow ye yet year yippee you your
     yours yourself yourselves
   ]
 end
 # Extension of the standard class String with useful functions.
 class String
   include RIR
   # Returns +true+ if +self+ belongs to RIR::Stoplist, +false+ otherwise.
   def is_stopword?
     # The stoplist is lowercase, so compare on the lowercased receiver.
     lowered = downcase
     Stoplist.member?(lowered)
   end
   # Do not use.
   # TODO: revamp. Find out why this function is here.
   def remove_special_characters
     # Blanks a string made of exactly one character, leaves others as is.
     drop_single = lambda { |text| text.sub(/\A.\z/, '') }
     cleaned = split.map do |word|
       # Non-word characters become separators inside each word.
       pieces = word.gsub(/\W/, ' ').split.map do |piece|
         drop_single.call(piece.gsub(/\W/, ' ').strip)
       end
       drop_single.call(pieces.join(' ').strip)
     end
     cleaned.join(' ')
   end
   # Removes all XML-like tags from +self+.
   #
   #   s = "<html><body>test</body></html>"
   #   s.strip_xml_tags!
   #   s                                     #=> "test"
   def strip_xml_tags!
     # Anything between '<' and the next '>' counts as a tag.
     tag_pattern = %r{</?[^>]*>}
     replace(strip_with_pattern(tag_pattern))
   end
   # Removes all XML-like tags from +self+.
   #
   #   s = "<html><body>test</body></html>"
   #   s.strip_xml_tags                      #=> "test"
   #   s                                     #=> "<html><body>test</body></html>"
   def strip_xml_tags
     dup.strip_xml_tags!
   end
   # Removes all Javascript sources from +self+.
   #
   #   s = "<script type='text/javascript'>
   #         var skin='vector',
   #         stylepath='http://bits.wikimedia.org/skins-1.5'
   #        </script>
   #
   #        test"
   #   s.strip_javascripts!
   #   s                                     #=> "test"
   def strip_javascripts!
     # Match every <script> element whatever its attributes, quoting style or
     # letter case, and also when its body is empty. The previous pattern only
     # recognised the literal <script type="text/javascript"> opening tag with
     # a non-empty body.
     replace(strip_with_pattern(/<script\b[^>]*>.*?<\/script\s*>/mi))
   end
   # Removes all Javascript sources from +self+.
   #
   #   s = "<script type='text/javascript'>
   #         var skin='vector',
   #         stylepath='http://bits.wikimedia.org/skins-1.5'
   #        </script>
   #
   #        test"
   #   s.strip_javascripts                   #=> "test"
   def strip_javascripts
     # Work on a copy so the receiver is left untouched.
     copy = dup
     copy.strip_javascripts!
   end
   # Removes all inline stylesheets (<style> elements) from +self+.
   #
   #   s = "<style type='text/css'>p { color: red; }</style>test"
   #   s.strip_stylesheets!
   #   s                                     #=> "test"
   def strip_stylesheets!
     # Match every <style> element whatever its attributes, quoting style or
     # letter case, and also when its body is empty. The previous pattern only
     # recognised the literal <style type="text/css"> opening tag with a
     # non-empty body.
     replace(strip_with_pattern(/<style\b[^>]*>.*?<\/style\s*>/mi))
   end
   # Returns a copy of +self+ with its stylesheets removed; +self+ is left
   # untouched.
   def strip_stylesheets
     copy = dup
     copy.strip_stylesheets!
   end
+  # Removes punctuation from +self+.
+  #
+  #   s = "hello, world. how are you?!"
+  #   s.strip_punctuation!
+  #   s                                 # => "hello world how are you"
+  def strip_punctuation!
+    replace strip_with_pattern /[^a-zA-Z0-9\-\s]/
+  end
+  # Removes punctuation from +self+.
+  #
+  #   s = "hello, world. how are you?!"
+  #   s.strip_punctuation               # => "hello world how are you"
+  def strip_punctuation
+    dup.strip_punctuation!
+  end
   # Returns the text values inside all occurrences of an XML tag in +self+.
   #
   #   s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre"
   #   s.extract_xmltags_values 'a' #=> ["Indianapolis", "Indiana"]
   def extract_xmltags_values(tag_name)
     # Escape the tag name so regexp metacharacters in it are taken literally.
     tag = Regexp.escape(tag_name.to_s)
     # The name must be followed by whitespace (attributes) or directly by
     # '>', so that asking for 'a' does not also match <abbr> or <article>.
     scan(/<#{tag}(?:\s[^>]*)?>(.+?)<\/#{tag}>/).flatten
   end
   # Returns a new string built from +self+ in three steps: every match of
   # +pattern+ is removed, HTML entities are unescaped, and the result is
   # converted to UTF-8.
   def strip_with_pattern(pattern)
     require 'cgi'
     require 'kconv'
     stripped = gsub(pattern, "")
     CGI.unescapeHTML(stripped).toutf8
   end
   private :strip_with_pattern
 end

main.rb

Diff comments View file @ 1453875

 $LOAD_PATH.unshift File.expand_path(File.join(File.dirname(__FILE__), "lib"))
 require 'rir'
 w = RIR::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan")
 p w.entropy("guitar")
 params = RIR::Indri::Parameters.new("path_vers_mon_index")
-p params.rule
 q = RIR::Indri::IndriQuery.new("pouet", "bla", params)
 puts q
+c = RIR::Corpus.new "/home/romain/INEX/BookTrack/corpus/"
+puts c.files.size

test/string_test.rb

Diff comments View file @ 1453875

 #!/usr/bin/env ruby
 require 'test/unit'
 require 'string'
 class TestString < Test::Unit::TestCase
   def test_extract_xml
     s = "four-piece in <a>Indianapolis</a>, <a>Indiana</a> at the Murat Theatre"
     assert_equal(["Indianapolis", "Indiana"],s.extract_xmltags_values('a'))
   end
   def test_stopword
     assert_equal(true, "is".is_stopword?)
     assert_equal(true, "seen".is_stopword?)
     assert_equal(false, "totally".is_stopword?)
     assert_equal(false, "Paris".is_stopword?)
   end
   def test_strip_xml
     assert_equal("testme", "<test>testme</test>".strip_xml_tags)
   end
+  def test_strip_punctuation
+    assert_equal("test test test test   test test", "test, test. .test, ;test !! ? test ...test./".strip_punctuation)
+  end
 end