From b506940c3f0fd9a95c10034e4e6b940a9381056e Mon Sep 17 00:00:00 2001 From: Romain Deveaud Date: Mon, 3 Jan 2011 13:25:02 +0100 Subject: [PATCH] possibility to extract only html fields contents when initializing a WebDocument --- lib/mirimiri/document.rb | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/lib/mirimiri/document.rb b/lib/mirimiri/document.rb index 88fc60d..ac7e4c9 100644 --- a/lib/mirimiri/document.rb +++ b/lib/mirimiri/document.rb @@ -80,6 +80,7 @@ module Mirimiri # entropy("dillinger escape plan") #=> 0.265862076325102 def entropy(s) en = 0.0 + # TODO: count_words as an attribute? counts = self.count_words s.split.each do |w| @@ -119,9 +120,10 @@ module Mirimiri # WebDocument constructor, the content of the Document is the HTML page # without the tags. - def initialize(url) + def initialize(url,only_tags=nil) @url = url - super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags + content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("") + super content.strip_javascripts.strip_stylesheets.strip_xml_tags end end @@ -154,10 +156,5 @@ module Mirimiri WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty? end -# def initialize(name) -# title = WikipediaPage.search_wikipedia_titles name -# raise ArgumentError, "No page found" if title.empty? -# super WikipediaPage.get_url title[0] -# end end end -- 1.8.2.3