Commit b506940c3f0fd9a95c10034e4e6b940a9381056e

Authored by Romain Deveaud
1 parent 845768f8ac
Exists in master

Add the possibility to extract only the contents of specific HTML tags when initializing a WebDocument

Showing 1 changed file with 4 additions and 7 deletions Side-by-side Diff

lib/mirimiri/document.rb
... ... @@ -80,6 +80,7 @@
80 80 # entropy("dillinger escape plan") #=> 0.265862076325102
81 81 def entropy(s)
82 82 en = 0.0
  83 + # TODO: count_words as an attribute?
83 84 counts = self.count_words
84 85  
85 86 s.split.each do |w|
86 87  
... ... @@ -119,9 +120,10 @@
119 120  
120 121 # WebDocument constructor, the content of the Document is the HTML page
121 122 # without the tags.
122   - def initialize(url)
  123 + def initialize(url,only_tags=nil)
123 124 @url = url
124   - super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags
  125 + content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("")
  126 + super content.strip_javascripts.strip_stylesheets.strip_xml_tags
125 127 end
126 128 end
127 129  
... ... @@ -154,11 +156,6 @@
154 156 WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty?
155 157 end
156 158  
157   -# def initialize(name)
158   -# title = WikipediaPage.search_wikipedia_titles name
159   -# raise ArgumentError, "No page found" if title.empty?
160   -# super WikipediaPage.get_url title[0]
161   -# end
162 159 end
163 160 end