Add the possibility to extract only the contents of specified HTML tags when initializing a WebDocument

Romain Deveaud
1 parent 845768f8ac
Showing 1 changed file with 4 additions and 7 deletions Side-by-side Diff
lib/mirimiri/document.rb
@@ -80,6 +80,7 @@
     #   entropy("dillinger escape plan") #=> 0.265862076325102
     def entropy(s)
       en = 0.0
+      # TODO: count_words as an attribute?
       counts = self.count_words
  
       s.split.each do |w|
  
@@ -119,9 +120,10 @@
  
     # WebDocument constructor, the content of the Document is the HTML page
     # without the tags.
-    def initialize(url)
+    def initialize(url,only_tags=nil)
       @url = url
-      super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags
+      content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("")
+      super content.strip_javascripts.strip_stylesheets.strip_xml_tags
     end
   end
  
@@ -154,11 +156,6 @@
       WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty?
     end
  
-#    def initialize(name)
-#      title = WikipediaPage.search_wikipedia_titles name
-#      raise ArgumentError, "No page found" if title.empty? 
-#      super WikipediaPage.get_url title[0]
-#    end
   end
 end
...	...	@@ -80,6 +80,7 @@
80	80	# entropy("dillinger escape plan") #=> 0.265862076325102
81	81	def entropy(s)
82	82	en = 0.0
	83	+ # TODO: count_words as an attribute?
83	84	counts = self.count_words
84	85
85	87	s.split.each do |w|
86	87
...	...	@@ -119,9 +120,10 @@
119	120
120	121	# WebDocument constructor, the content of the Document is the HTML page
121	122	# without the tags.
122		- def initialize(url)
	123	+ def initialize(url,only_tags=nil)
123	124	@url = url
124		- super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags
	125	+ content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("")
	126	+ super content.strip_javascripts.strip_stylesheets.strip_xml_tags
125	127	end
126	128	end
127	129
...	...	@@ -154,11 +156,6 @@
154	156	WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty?
155	157	end
156	158
157		-# def initialize(name)
158		-# title = WikipediaPage.search_wikipedia_titles name
159		-# raise ArgumentError, "No page found" if title.empty?
160		-# super WikipediaPage.get_url title[0]
161		-# end
162	159	end
163	160	end