Commit b506940c3f0fd9a95c10034e4e6b940a9381056e
1 parent
845768f8ac
Exists in
master
Add the possibility to extract only the contents of given HTML tags when initializing a WebDocument
Showing 1 changed file with 4 additions and 7 deletions Side-by-side Diff
lib/mirimiri/document.rb
... | ... | @@ -80,6 +80,7 @@ |
80 | 80 | # entropy("dillinger escape plan") #=> 0.265862076325102 |
81 | 81 | def entropy(s) |
82 | 82 | en = 0.0 |
83 | + # TODO: count_words as an attribute? | |
83 | 84 | counts = self.count_words |
84 | 85 | |
85 | 86 | s.split.each do |w| |
86 | 87 | |
... | ... | @@ -119,9 +120,10 @@ |
119 | 120 | |
120 | 121 | # WebDocument constructor, the content of the Document is the HTML page |
121 | 122 | # without the tags. |
122 | - def initialize(url) | |
123 | + def initialize(url,only_tags=nil) | |
123 | 124 | @url = url |
124 | - super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags | |
125 | + content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("") | |
126 | + super content.strip_javascripts.strip_stylesheets.strip_xml_tags | |
125 | 127 | end |
126 | 128 | end |
127 | 129 | |
... | ... | @@ -154,11 +156,6 @@ |
154 | 156 | WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty? |
155 | 157 | end |
156 | 158 | |
157 | -# def initialize(name) | |
158 | -# title = WikipediaPage.search_wikipedia_titles name | |
159 | -# raise ArgumentError, "No page found" if title.empty? | |
160 | -# super WikipediaPage.get_url title[0] | |
161 | -# end | |
162 | 159 | end |
163 | 160 | end |