Deveaud Romain / mirimiri

Browse Code »

Commit aa386f5530e2fcc8866a8c83ded03b4f672c4d18

Authored by Romain Deveaud 2012-03-02 16:23:51 +0100

1 parent b768fe9411

Exists in master

changes in query, new index class

Showing 4 changed files with 81 additions and 4 deletions Inline Diff

lib/mirimiri/document.rb
lib/mirimiri/index.rb
lib/mirimiri/query.rb
lib/mirimiri/string.rb

lib/mirimiri/document.rb

Diff comments View file @ aa386f5

 #!/usr/bin/env ruby
 #--
 # This file is a part of the mirimiri library
 #
 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #++
 # General module
 module Mirimiri
   # A Document is a bag of words and is constructed from a string.
   #
   # The raw text is available through +doc_content+ and the lowercased
   # tokens through +words+. Word counts are computed once, at construction.
   class Document
     # +count_words+ is not listed here: the protected method of the same name
     # defined below would replace a generated reader anyway.
     attr_reader :words, :doc_content

     # Splits the content into lowercased words.
     #
     # The content is first split on whitespace, then on any non-word
     # character (see http://perldoc.perl.org/perlre.html and the \\W special
     # escape). Pieces that contain no ASCII letter are dropped.
     #
     # Protected function, only meant to be called at the initialization.
     def format_words
       @doc_content.split.flat_map do |chunk|
         chunk.split(/\W/).select { |piece| piece =~ /[a-zA-Z]/ }.map(&:downcase)
       end
     end

     # Returns an Array containing the distinct +n+-grams (words) from the
     # current Document, in order of first appearance.
     #
     # An empty Array is returned when +n+ is not a whole number >= 1 or when
     # the document has fewer than +n+ words.
     #
     #   ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]
     def ngrams(n)
       return [] unless n.is_a?(Numeric) && n >= 1 && n == n.floor
       @words.each_cons(n.to_i).map { |gram| gram.join(" ") }.uniq
     end

     # Returns a Hash containing the words and their associated counts in the current Document.
     #
     # Looking up a word that is not in the document yields 0 and does not add
     # the word to the Hash.
     #
     #   count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }
     def count_words
       @words.each_with_object(Hash.new(0)) { |word, counts| counts[word] += 1 }
     end

     # Computes the entropy of a given string +s+ inside the document.
     #
     # If the string parameter is composed of many words (i.e. tokens separated
     # by whitespace(s)), it is considered as an ngram.
     #
     # Words that do not occur in the document contribute nothing, and the
     # entropy is 0.0 for an empty document.
     #
     #   entropy("guitar") #=> 0.00432114812727959
     #   entropy("dillinger escape plan") #=> 0.265862076325102
     def entropy(s)
       total = @words.size.to_f
       return 0.0 if total.zero?
       s.split.inject(0.0) do |acc, w|
         p_wi = @count_words[w] / total
         # p * log2(p) tends to 0 as p tends to 0; evaluating it directly at
         # p == 0 would give NaN (0 * -Infinity).
         p_wi > 0 ? acc - p_wi * Math.log2(p_wi) : acc
       end
     end

     # Computes the term frequency of a given *word* +s+.
     #
     # Returns 0.0 for an empty document.
     #
     #   tf("guitar") #=> 0.000380372765310004
     def tf(s)
       return 0.0 if @words.empty?
       @count_words[s].to_f / @words.size.to_f
     end

     # Builds the document from +content+: stores the raw text, tokenizes it
     # and caches the word counts.
     def initialize(content="")
       @doc_content = content
       @words = format_words
       @count_words = count_words
     end

     protected :format_words, :count_words
   end
   # A WebDocument is a Document with a +url+.
   class WebDocument < Document
     attr_reader :url
     # Fetches the page located at +url+ over HTTP and returns its body
     # (the HTML text) as a String.
     def self.get_content(url)
       require 'net/http'
       target = URI.parse(url)
       Net::HTTP.get(target)
     end
     # WebDocument constructor, the content of the Document is the HTML page
     # without the tags.
     def initialize(url,only_tags=nil)
+      require 'sanitize'
       @url = url
       content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("")
-      super content.strip_javascripts.strip_xml_tags
+      super Sanitize.clean(content.unaccent.toutf8.force_encoding("UTF-8"), :remove_contents => ['script'])
     end
   end
   # A WikipediaPage is a WebDocument.
   class WikipediaPage < WebDocument
     require 'rexml/document'
     require 'net/http'
     require 'kconv'
     # Searches the English Wikipedia API for +name+ and returns the titles of
     # the results.
     #
     # Returns an Array of titles, or nil when the response contains no
     # api/query/search element.
     # Raises ArgumentError when +name+ is not UTF-8 (as reported by +isutf8+).
     def self.search_wikipedia_titles(name)
       # The offending value goes into the message: given as a third argument
       # to raise it would be used as the exception's backtrace.
       raise ArgumentError, "Bad encoding: #{name.inspect}" unless name.isutf8
       # encode_www_form_component also escapes reserved characters such as
       # '&' and '+', which would otherwise corrupt the query string.
       term = URI.encode_www_form_component(name)
       xml = Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{term}&format=xml"))
       # NOTE(review): +unaccent+ is not defined in this file; presumably a String extension of the library.
       res = REXML::Document.new(xml.unaccent.toutf8).elements['api/query/search']
       res.collect { |e| e.attributes['title'] } unless res.nil?
     end
     # Asks the English Wikipedia API for the full URL of the page titled +name+.
     #
     # Returns the value of the page's +fullurl+ attribute, or nil when the
     # page is flagged as missing or the response contains no page element.
     # Raises ArgumentError when +name+ is not UTF-8 (as reported by +isutf8+).
     def self.get_url(name)
       # The offending value goes into the message: given as a third argument
       # to raise it would be used as the exception's backtrace.
       raise ArgumentError, "Bad encoding: #{name.inspect}" unless name.isutf8
       # encode_www_form_component also escapes reserved characters such as
       # '&' and '+', which would otherwise corrupt the query string.
       title = URI.encode_www_form_component(name)
       xml = Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&titles=#{title}&inprop=url&prop=info&format=xml"))
       # NOTE(review): +unaccent+ is not defined in this file; presumably a String extension of the library.
       page = REXML::Document.new(xml.unaccent.toutf8).elements['api/query/pages/page']
       # An unexpected response (no page element) is reported as "no URL"
       # instead of failing with NoMethodError on nil.
       return nil if page.nil?
       atts = page.attributes
       atts['fullurl'] if atts['missing'].nil?
     end
     # Looks +name+ up through the title search and returns the URL of the
     # first hit, or nil when the search yields nothing.
     def self.search_homepage(name)
       titles = WikipediaPage.search_wikipedia_titles(name)
       return nil if titles.nil? || titles.empty?
       WikipediaPage.get_url(titles[0])
     end
+    def self.extract_anchors(url)
+      self.get_content(url).extract_xmltags_values('p').join(' ').scan(/<a href="(.+?)" title=.*?>(.+?)<\/a>/).delete_if { |a| a[0] =~ /^\/wiki\/.*$/.negated }
+    end
+  end
+  class FreebasePage < WebDocument
+    require 'net/http'
+    require 'kconv'
+    require 'json'
+    def self.search_article_ids query,limit
+      raise ArgumentError, "Bad encoding", name unless name.isutf8
+      JSON.parse(Net::HTTP.get( URI.parse "http://api.freebase.com/api/service/search?query=#{query.gsub(" ","+")}&limit=#{limit}" ))['result'].collect { |a| a['article']['id'] unless a['article'].nil? }.compact
+    end
+    def self.get_url id
+      "http://api.freebase.com/api/trans/raw#{id}"
+    end
   end
 end

lib/mirimiri/index.rb

Diff comments View file @ aa386f5

File was created	1	#!/usr/bin/env ruby
	2
	3	#--
	4	# This file is a part of the mirimiri library
	5	#
	6	# Copyright (C) 2010-2012 Romain Deveaud <romain.deveaud@gmail.com>
	7	#
	8	# This program is free software: you can redistribute it and/or modify
	9	# it under the terms of the GNU General Public License as published by
	10	# the Free Software Foundation, either version 3 of the License, or
	11	# (at your option) any later version.
	12	#
	13	# This program is distributed in the hope that it will be useful,
	14	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	15	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	16	# GNU General Public License for more details.
	17	#
	18	# You should have received a copy of the GNU General Public License
	19	# along with this program. If not, see <http://www.gnu.org/licenses/>.
	20	#++
	21
	22	class Index
	23	end
	24
	25	module Indri
	26
	27	class IndriIndex
	28
	29	def exec indriquery
	30	raise ArgumentError, 'Argument is not an IndriQuery' unless indriquery.is_a? Indri::IndriQuery
	31
	32	query = "IndriRunQuery -query#{indriquery.query} -index=#{@path}"
	33
	34	query += " -count=#{indriquery.count}" unless indriquery.count.nil?
	35	query += " -rule=method:#{indriquery.sm_method},#{indriquery.sm_param}:#{indriquery.sm_value}" unless indriquery.sm_method.nil?
	36	query += " #{indriquery.args}" unless indriquery.args.nil?
	37	end
	38	end
	39	end
	40

lib/mirimiri/query.rb

Diff comments View file @ aa386f5

 #!/usr/bin/env ruby
 #--
 # This file is a part of the mirimiri library
 #
 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #++
 # Base class for query objects. It declares no state or behavior of its own;
 # the Indri query classes below inherit from it.
 class Query
 end
 module Indri
   class Parameters
     attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline
-    def initialize(corpus,count="1000",mem="1g",offset="1",run_id="default",print_query=false,print_docs=false)
+    def initialize(corpus,count="1000",mem="1g",threads="1",offset="1",run_id="default",print_query=false,print_docs=false)
       @index_path  = corpus
       @memory      = mem
       @count       = count
+      @threads     = threads
       @offset      = offset
       @run_id      = run_id
       @print_query = print_query ? "true" : "false"
       @print_docs  = print_docs  ? "true" : "false"
     end
     def to_s
       h = "<memory>#{@memory}</memory>\n"
       h += "<index>#{@index_path}</index>\n"
       h += "<count>#{@count}</count>\n"
+      h += "<threads>#{@threads}</threads>\n"
       unless @baseline.nil?
         h += "<baseline>#{@baseline}</baseline>\n"
       else
         h += "<rule>#{@rule}</rule>\n"
       end
       h += "<trecFormat>true</trecFormat>\n"
       h += "<queryOffset>#{@offset}</queryOffset>\n"
       h += "<runID>#{@run_id}</runID>\n"
       h += "<printQuery>#{@print_query}</printQuery>\n"
       h += "<printDocuments>#{@print_docs}</printDocuments>\n"
       h
     end
   end
-  class IndriQuery < Query
+  class IndriQueryOld < Query
     attr_accessor :id, :query, :rule
     # Stores the query identifier +id+ and the query text +query+.
     def initialize(id,query)
       @id, @query = id, query
     end
     # Renders the query as a <query> element holding its number and text,
     # one tag per line, each line terminated by a newline.
     def to_s
       [
         "<query>\n",
         "<number>#{@id}</number>\n",
         "<text>#{@query}</text>\n",
         "</query>\n"
       ].join
     end
     def exec params
       `IndriRunQuery -query='#{@query}' -index=#{params.index_path} -count=#{params.count} -rule=method:dirichlet,mu:2500 -trecFormat`
+    end
+  end
+  class IndriQuery < Query
+    attr_accessor :query, :count, :sm_method, :sm_param, :sm_value, :args
+    def initialize atts={},args=nil
+      raise ArgumentError, 'Argument 1 must be a Hash' unless args.is_a? Hash
+      atts.each do |k,v|
+        instance_variable_set("@#{k}", v) unless v.nil?
+      end
+      raise ArgumentError, 'Argument 2 must be a String' unless args.is_a? String
+      @args = args
     end
   end
   # Groups a parameters object with any number of queries and renders the
   # whole as a single <parameters> document.
   class IndriQueries
     attr_accessor :params, :queries

     # +params+ must respond to +rule+ and +rule=+; every remaining argument
     # is kept as a query.
     def initialize(params,*queries)
       @params  = params
       @queries = queries
       # When no rule was chosen, fall back to Language Modeling with a
       # Dirichlet smoothing at 2500.
       # TODO: maybe a Rule class...
       if @params.rule.nil?
         @params.rule = 'method:dirichlet,mu:2500'
       end
     end

     # Concatenates the rendered parameters and the rendered queries between
     # <parameters> tags.
     def to_s
       rendered_queries = @queries.map { |query| query.to_s }.join("")
       "<parameters>\n" + @params.to_s + rendered_queries + "</parameters>"
     end
   end
 end

lib/mirimiri/string.rb

Diff comments View file @ aa386f5

1	#!/usr/bin/env ruby	1	#!/usr/bin/env ruby
2		2
3	#--	3	#--
4	# This file is a part of the mirimiri library	4	# This file is a part of the mirimiri library
5	#	5	#
6	# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>	6	# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
7	#	7	#
8	# This program is free software: you can redistribute it and/or modify	8	# This program is free software: you can redistribute it and/or modify
9	# it under the terms of the GNU General Public License as published by	9	# it under the terms of the GNU General Public License as published by
10	# the Free Software Foundation, either version 3 of the License, or	10	# the Free Software Foundation, either version 3 of the License, or
11	# (at your option) any later version.	11	# (at your option) any later version.
12	#	12	#
13	# This program is distributed in the hope that it will be useful,	13	# This program is distributed in the hope that it will be useful,
14	# but WITHOUT ANY WARRANTY; without even the implied warranty of	14	# but WITHOUT ANY WARRANTY; without even the implied warranty of
15	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	15	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16	# GNU General Public License for more details.	16	# GNU General Public License for more details.
17	#	17	#
18	# You should have received a copy of the GNU General Public License	18	# You should have received a copy of the GNU General Public License
19	# along with this program. If not, see <http://www.gnu.org/licenses/>.	19	# along with this program. If not, see <http://www.gnu.org/licenses/>.
20	#++	20	#++
21		21
22	module Mirimiri	22	module Mirimiri
23		23
24	# These are the default stopwords provided by Lemur.	24	# These are the default stopwords provided by Lemur.
25	Stoplist = [	25	Stoplist = [
26	"a","about","above","according","across","after","afterwards","again","against",	26	"a","about","above","according","across","after","afterwards","again","against",
27	"albeit","all","almost","alone","along","already","also","although","always","am",	27	"albeit","all","almost","alone","along","already","also","although","always","am",
28	"among","amongst","an","and","another","any","anybody","anyhow","anyone","anything",	28	"among","amongst","an","and","another","any","anybody","anyhow","anyone","anything",
29	"anyway","anywhere","apart","are","around","as","at","av","be","became","because",	29	"anyway","anywhere","apart","are","around","as","at","av","be","became","because",
30	"become","becomes","becoming","been","before","beforehand","behind","being","below",	30	"become","becomes","becoming","been","before","beforehand","behind","being","below",
31	"beside","besides","between","beyond","both","but","by","can","cannot","canst",	31	"beside","besides","between","beyond","both","but","by","can","cannot","canst",
32	"certain","cf","choose","contrariwise","cos","could","cu","day","do","does","doesn't",	32	"certain","cf","choose","contrariwise","cos","could","cu","day","do","does","doesn't",
33	"doing","dost","doth","double","down","dual","during","each","either","else",	33	"doing","dost","doth","double","down","dual","during","each","either","else",
34	"elsewhere","enough","et","etc","even","ever","every","everybody","everyone",	34	"elsewhere","enough","et","etc","even","ever","every","everybody","everyone",
35	"everything","everywhere","except","excepted","excepting","exception","exclude",	35	"everything","everywhere","except","excepted","excepting","exception","exclude",
36	"excluding","exclusive","far","farther","farthest","few","ff","first","for",	36	"excluding","exclusive","far","farther","farthest","few","ff","first","for",
37	"formerly","forth","forward","from","front","further","furthermore","furthest","get",	37	"formerly","forth","forward","from","front","further","furthermore","furthest","get",
38	"go","had","halves","hardly","has","hast","hath","have","he","hence","henceforth",	38	"go","had","halves","hardly","has","hast","hath","have","he","hence","henceforth",
39	"her","here","hereabouts","hereafter","hereby","herein","hereto","hereupon","hers",	39	"her","here","hereabouts","hereafter","hereby","herein","hereto","hereupon","hers",
40	"herself","him","himself","hindmost","his","hither","hitherto","how","however",	40	"herself","him","himself","hindmost","his","hither","hitherto","how","however",