From b506940c3f0fd9a95c10034e4e6b940a9381056e Mon Sep 17 00:00:00 2001
From: Romain Deveaud <romain.deveaud@gmail.com>
Date: Mon, 3 Jan 2011 13:25:02 +0100
Subject: [PATCH] Allow extracting only the contents of given HTML tags when
 initializing a WebDocument

---
 lib/mirimiri/document.rb | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/lib/mirimiri/document.rb b/lib/mirimiri/document.rb
index 88fc60d..ac7e4c9 100644
--- a/lib/mirimiri/document.rb
+++ b/lib/mirimiri/document.rb
@@ -80,6 +80,7 @@ module Mirimiri
     #   entropy("dillinger escape plan") #=> 0.265862076325102
     def entropy(s)
       en = 0.0
+      # TODO: should the result of count_words be stored as an attribute instead of being recomputed here?
       counts = self.count_words
 
       s.split.each do |w|
@@ -119,9 +120,10 @@ module Mirimiri
 
     # WebDocument constructor, the content of the Document is the HTML page
     # without the tags.
-    def initialize(url)
+    def initialize(url,only_tags=nil)
       @url = url
-      super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags
+      content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("")
+      super content.strip_javascripts.strip_stylesheets.strip_xml_tags
     end
   end
 
@@ -154,10 +156,5 @@ module Mirimiri
       WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty?
     end
 
-#    def initialize(name)
-#      title = WikipediaPage.search_wikipedia_titles name
-#      raise ArgumentError, "No page found" if title.empty? 
-#      super WikipediaPage.get_url title[0]
-#    end
   end
 end
-- 
1.8.2.3