Deveaud Romain / mirimiri

Commit b3c02139759e76987368a3d947c49f8c6641ce21

Authored by Romain Deveaud 2012-03-08 19:00:05 +0100

Exists in master

faster computing of successive calls to ngrams(). is_stopword? is now effective …

…for multiword expressions. sdm fix.

Showing 3 changed files with 13 additions and 8 deletions Inline Diff

lib/mirimiri/document.rb
lib/mirimiri/string.rb
main.rb

lib/mirimiri/document.rb

Diff comments View file @ b3c0213

 #!/usr/bin/env ruby
 #--
 # This file is a part of the mirimiri library
 #
 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #++
 # General module
 module Mirimiri
   # A Document is a bag of words and is constructed from a string.
   class Document
     attr_reader :words, :doc_content, :count_words
     # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html
     # and the \\W special escape).
     #
     # Protected function, only meant to be called during initialization.
     def format_words
       # Break the raw content on whitespace, then break every chunk again on
       # non-word characters so punctuation never ends up inside a token.
       fragments = @doc_content.split.flat_map { |chunk| chunk.split(/\W/) }
       # Only fragments holding at least one ASCII letter are kept as words;
       # each kept fragment is lowercased.
       fragments.select { |fragment| fragment =~ /[a-zA-Z]/ }.map { |fragment| fragment.downcase }
     end
     # Returns an Array containing the +n+-grams (words) from the current Document.
     #
     #   ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]
     def ngrams(n)
       # NOTE: this method is shown as a unified diff. Lines starting with "-"
       # are the previous implementation, lines starting with "+" replace them.
       #
       # Both versions slide a window over @words: each word is pushed, and once
       # the window holds +n+ words they are joined with a single space, stored,
       # and the oldest word is dropped.
       #
       # The "+" version only runs that loop when @ngrams[n] is nil, stores the
       # resulting Array in @ngrams[n], and returns @ngrams[n], so later calls
       # with the same +n+ return the stored Array instead of recomputing it.
       window       = []
       ngrams_array = []
-      @words.each do |w|
+      if @ngrams[n].nil?
-        window.push(w)
+        @words.each do |w|
-        if window.size == n
+          window.push(w)
-          ngrams_array.push window.join(" ")
+          if window.size == n
-          window.delete_at(0)
+            ngrams_array.push window.join(" ")
+            window.delete_at(0)
+          end
         end
+        @ngrams[n] = ngrams_array
       end
-      ngrams_array
+      @ngrams[n]
     end
     # Returns a Hash containing the words and their associated counts in the current Document.
     #
     #   count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }
     def count_words
       # Missing keys are initialised to 0 on first access, which lets the tally
       # below increment without checking for presence first.
       tally = Hash.new { |hash, word| hash[word] = 0 }
       @words.each_with_object(tally) { |word, acc| acc[word] += 1 }
     end
     # Old entropy function.
     # TODO: remove.
     def entropy0(s)
       # Sums -p(w) * log2(p(w)) over the whitespace-separated words of +s+,
       # where p(w) is the count of w divided by the number of words in the
       # document.
       #
       # A word that does not occur in the document (p = 0), or any word when the
       # document is empty (0/0), contributes nothing: evaluating
       # 0 * log2(0) would otherwise turn the whole result into NaN.
       total = @words.count.to_f
       s.split.inject(0.0) do |en, w|
         p_wi = @count_words[w].to_f/total
         # NaN > 0 is false, so the empty-document case is skipped here as well.
         p_wi > 0 ? en - p_wi*Math.log2(p_wi) : en
       end
     end
     # Computes the entropy of a given string +s+ inside the document.
     #
     # If the string parameter is composed of many words (i.e. tokens separated
     # by whitespace(s)), it is considered as an ngram.
     #
     #   entropy("guitar") #=> 0.014348983965324762
     #   entropy("dillinger escape plan") #=> 0.054976093116768154
     def entropy(s)
       # Work on whitespace-normalised tokens so that surrounding or repeated
       # blanks do not prevent +s+ from matching a word or a space-joined n-gram.
       tokens = s.split
       term   = tokens.join(" ")
       p_wi =
         if tokens.size == 1
           # Single word: relative frequency among the document's words.
           @count_words[term].to_f/@words.count.to_f
         elsif tokens.size > 1
           # Several words: relative frequency among the n-grams of that size.
           ng_size = ngrams(tokens.size)
           ng_size.count(term).to_f/ng_size.count.to_f
         else
           0.0
         end
       # A term that never occurs (p = 0) or an empty document (0/0 = NaN) has
       # no entropy; return 0.0 rather than letting 0 * log(0) produce NaN.
       return 0.0 unless p_wi > 0
       -p_wi*Math.log(p_wi)
     end
     # Computes the term frequency of a given *word* +s+.
     #
     #   tf("guitar") #=> 0.000380372765310004
     def tf(s)
       # An empty document would evaluate 0.0/0.0 (NaN); no term can occur in
       # it, so report a frequency of 0.0 instead.
       return 0.0 if @words.empty?
       @count_words[s].to_f/@words.size.to_f
     end
     def initialize(content="")
       # Keeps the raw text, then derives the word list and the word counts
       # from it (format_words reads @doc_content, count_words reads @words).
       @doc_content = content
       @words = format_words
       @count_words = count_words
       # The "+" marks a line added by this commit: a Hash, initially empty,
       # in which ngrams stores its result under the key +n+.
+      @ngrams = {}
     end
     protected :format_words, :count_words
   end
   # A WebDocument is a Document with a +url+.
   class WebDocument < Document
     attr_reader :url
     # Returns the HTML text from the page of a given +url+.
     def self.get_content(url)
       # net/http is required on demand, the first time a page is fetched.
       require 'net/http'
       target = URI.parse(url)
       Net::HTTP.get(target)
     end
     # WebDocument constructor, the content of the Document is the HTML page
     # without the tags.
     def initialize(url,only_tags=nil)
       require 'sanitize'
       @url = url
       # Fetch the page once; when +only_tags+ is given, keep only the values
       # returned by extract_xmltags_values for it, concatenated together.
       page = WebDocument.get_content(url)
       page = page.extract_xmltags_values(only_tags).join("") unless only_tags.nil?
       # Normalise the text before handing it to Sanitize, which is also told
       # to drop the contents of script elements.
       normalised = page.unaccent.toutf8.force_encoding("UTF-8")
       super Sanitize.clean(normalised, :remove_contents => ['script'])
     end
   end
   # A WikipediaPage is a WebDocument.
   class WikipediaPage < WebDocument
     require 'rexml/document'
     require 'net/http'
     require 'kconv'
     # Queries the English Wikipedia search API for +name+ and collects the
     # 'title' attribute of every element under api/query/search.
     # Returns nil when the response has no such element.
     #
     # Raises ArgumentError when +name+ is not valid UTF-8.
     def self.search_wikipedia_titles(name)
       # The offending value goes into the message: a third argument to raise
       # is taken as the backtrace and would replace the real one.
       raise ArgumentError, "Bad encoding: #{name.inspect}" unless name.isutf8
       # Form-encode the search terms so that reserved characters such as "&"
       # or "#" cannot cut the query string short.
       terms = URI.encode_www_form_component(name)
       xml = Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{terms}&format=xml"))
       res = REXML::Document.new(xml.unaccent.toutf8).elements['api/query/search']
       res.collect { |e| e.attributes['title'] } unless res.nil?
     end
     # Asks the English Wikipedia API for the page titled +name+ and returns
     # its 'fullurl' attribute. Returns nil when the page carries a 'missing'
     # attribute or when the response has no api/query/pages/page element.
     #
     # Raises ArgumentError when +name+ is not valid UTF-8.
     def self.get_url(name)
       # The offending value goes into the message: a third argument to raise
       # is taken as the backtrace and would replace the real one.
       raise ArgumentError, "Bad encoding: #{name.inspect}" unless name.isutf8
       # Form-encode the title so that reserved characters such as "&" or "#"
       # cannot cut the query string short.
       title = URI.encode_www_form_component(name)
       xml = Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&titles=#{title}&inprop=url&prop=info&format=xml"))
       page = REXML::Document.new(xml.unaccent.toutf8).elements['api/query/pages/page']
       # Without a page element there are no attributes to read.
       return nil if page.nil?
       atts = page.attributes
       atts['fullurl'] if atts['missing'].nil?
     end
     # Returns the URL of the first title found by search_wikipedia_titles for
     # +name+, or nil when the search gives nil or an empty list.
     def self.search_homepage(name)
       titles = WikipediaPage.search_wikipedia_titles(name)
       return nil if titles.nil? || titles.empty?
       WikipediaPage.get_url(titles.first)
     end
     # Scans the 'p' tag values of the page at +url+ for anchors and returns
     # the remaining [href, text] pairs after filtering.
     def self.extract_anchors(url)
       paragraphs = self.get_content(url).extract_xmltags_values('p').join(' ')
       # Each match is a two-element Array: the href and the anchor text.
       anchors = paragraphs.scan(/<a href="(.+?)" title=.*?>(.+?)<\/a>/)
       # NOTE(review): Regexp#negated is defined outside this view; presumably
       # this drops every href that is not a /wiki/ link. Confirm.
       anchors.delete_if { |a| a[0] =~ /^\/wiki\/.*$/.negated }
     end
   end
   class FreebasePage < WebDocument
     require 'net/http'
     require 'kconv'
     require 'json'
     # Sends +query+ to the Freebase search service, asking for at most +limit+
     # results, and returns the article ids of the results that have an
     # 'article' entry.
     #
     # Raises ArgumentError when +query+ is not valid UTF-8.
     def self.search_article_ids query,limit
       # Validate the +query+ argument itself. The bare identifier `name` is
       # not a parameter here and resolves to the class's own name.
       raise ArgumentError, "Bad encoding: #{query.inspect}" unless query.isutf8
       # Form-encoding still turns spaces into "+", and additionally escapes
       # reserved characters such as "&".
       terms = URI.encode_www_form_component(query)
       results = JSON.parse(Net::HTTP.get( URI.parse "http://api.freebase.com/api/service/search?query=#{terms}&limit=#{limit}" ))['result']
       results.collect { |a| a['article']['id'] unless a['article'].nil? }.compact
     end
     # Builds the raw-content URL for +id+ by appending it to the service
     # endpoint.
     def self.get_url(id)
       raw_endpoint = "http://api.freebase.com/api/trans/raw"
       "#{raw_endpoint}#{id}"
     end
   end
 end

lib/mirimiri/string.rb

Diff comments View file @ b3c0213

main.rb

Diff comments View file @ b3c0213

 # Example script. Puts the "lib" directory next to this file at the front of
 # the load path so that 'mirimiri' is loaded from there.
 $LOAD_PATH.unshift File.expand_path(File.join(File.dirname(__FILE__), "lib"))
 require 'mirimiri'
 # The "+" marks a line added by this commit.
+require "benchmark"
 # Build a document from a Wikipedia page, then print the entropy of a
 # three-word expression and the term frequency of a single word.
 w = Mirimiri::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan")
 p w.entropy("dillinger escape plan")
 p w.tf("guitar")
 # NOTE(review): the Indri classes and String#sequential_dependence_model are
 # defined outside this view; presumably this runs the query against a local
 # Indri index and wraps the printed documents. Confirm.
 query = Indri::IndriQuery.new({:query => "dillinger escape plan".sequential_dependence_model, :count => 10}, "-trecFormat=true -printDocuments=true")
 index = Indri::IndriIndex.new "/mnt/disk1/ClueWeb09_English_1noSpam"
 # The query output is re-tagged as ISO-8859-1 and transcoded to UTF-8.
 s = Indri::IndriPrintedDocuments.new(index.runquery(query).force_encoding("ISO-8859-1").encode("UTF-8"))

1	#!/usr/bin/env ruby	1	#!/usr/bin/env ruby
2		2
3	#--	3	#--
4	# This file is a part of the mirimiri library	4	# This file is a part of the mirimiri library
5	#	5	#
6	# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>	6	# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
7	#	7	#
8	# This program is free software: you can redistribute it and/or modify	8	# This program is free software: you can redistribute it and/or modify
9	# it under the terms of the GNU General Public License as published by	9	# it under the terms of the GNU General Public License as published by
10	# the Free Software Foundation, either version 3 of the License, or	10	# the Free Software Foundation, either version 3 of the License, or
11	# (at your option) any later version.	11	# (at your option) any later version.
12	#	12	#
13	# This program is distributed in the hope that it will be useful,	13	# This program is distributed in the hope that it will be useful,
14	# but WITHOUT ANY WARRANTY; without even the implied warranty of	14	# but WITHOUT ANY WARRANTY; without even the implied warranty of
15	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	15	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16	# GNU General Public License for more details.	16	# GNU General Public License for more details.
17	#	17	#
18	# You should have received a copy of the GNU General Public License	18	# You should have received a copy of the GNU General Public License
19	# along with this program. If not, see <http://www.gnu.org/licenses/>.	19	# along with this program. If not, see <http://www.gnu.org/licenses/>.
20	#++	20	#++
21		21
22	module Mirimiri	22	module Mirimiri
23		23
24	# These are the default stopwords provided by Lemur.	24	# These are the default stopwords provided by Lemur.
25	Stoplist = [	25	Stoplist = [
26	"a","about","above","according","across","after","afterwards","again","against",	26	"a","about","above","according","across","after","afterwards","again","against",
27	"albeit","all","almost","alone","along","already","also","although","always","am",	27	"albeit","all","almost","alone","along","already","also","although","always","am",
28	"among","amongst","an","and","another","any","anybody","anyhow","anyone","anything",	28	"among","amongst","an","and","another","any","anybody","anyhow","anyone","anything",
29	"anyway","anywhere","apart","are","around","as","at","av","be","became","because",	29	"anyway","anywhere","apart","are","around","as","at","av","be","became","because",
30	"become","becomes","becoming","been","before","beforehand","behind","being","below",	30	"become","becomes","becoming","been","before","beforehand","behind","being","below",
31	"beside","besides","between","beyond","both","but","by","can","cannot","canst",	31	"beside","besides","between","beyond","both","but","by","can","cannot","canst",
32	"certain","cf","choose","contrariwise","cos","could","cu","day","do","does","doesn't",	32	"certain","cf","choose","contrariwise","cos","could","cu","day","do","does","doesn't",
33	"doing","dost","doth","double","down","dual","during","each","either","else",	33	"doing","dost","doth","double","down","dual","during","each","either","else",
34	"elsewhere","enough","et","etc","even","ever","every","everybody","everyone",	34	"elsewhere","enough","et","etc","even","ever","every","everybody","everyone",
35	"everything","everywhere","except","excepted","excepting","exception","exclude",	35	"everything","everywhere","except","excepted","excepting","exception","exclude",
36	"excluding","exclusive","far","farther","farthest","few","ff","first","for",	36	"excluding","exclusive","far","farther","farthest","few","ff","first","for",
37	"formerly","forth","forward","from","front","further","furthermore","furthest","get",	37	"formerly","forth","forward","from","front","further","furthermore","furthest","get",
38	"go","had","halves","hardly","has","hast","hath","have","he","hence","henceforth",	38	"go","had","halves","hardly","has","hast","hath","have","he","hence","henceforth",
39	"her","here","hereabouts","hereafter","hereby","herein","hereto","hereupon","hers",	39	"her","here","hereabouts","hereafter","hereby","herein","hereto","hereupon","hers",
40	"herself","him","himself","hindmost","his","hither","hitherto","how","however",	40	"herself","him","himself","hindmost","his","hither","hitherto","how","however",