Commit b506940c3f0fd9a95c10034e4e6b940a9381056e
1 parent
845768f8ac
Exists in
master
Add the possibility to extract only the contents of selected HTML tags when initializing a WebDocument
Showing 1 changed file with 4 additions and 7 deletions Side-by-side Diff
lib/mirimiri/document.rb
| ... | ... | @@ -80,6 +80,7 @@ |
| 80 | 80 | # entropy("dillinger escape plan") #=> 0.265862076325102 |
| 81 | 81 | def entropy(s) |
| 82 | 82 | en = 0.0 |
| 83 | + # TODO: count_words as an attribute? | |
| 83 | 84 | counts = self.count_words |
| 84 | 85 | |
| 85 | 86 | s.split.each do |w| |
| 86 | 87 | |
| ... | ... | @@ -119,9 +120,10 @@ |
| 119 | 120 | |
| 120 | 121 | # WebDocument constructor, the content of the Document is the HTML page |
| 121 | 122 | # without the tags. |
| 122 | - def initialize(url) | |
| 123 | + def initialize(url,only_tags=nil) | |
| 123 | 124 | @url = url |
| 124 | - super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags | |
| 125 | + content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("") | |
| 126 | + super content.strip_javascripts.strip_stylesheets.strip_xml_tags | |
| 125 | 127 | end |
| 126 | 128 | end |
| 127 | 129 | |
| ... | ... | @@ -154,11 +156,6 @@ |
| 154 | 156 | WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty? |
| 155 | 157 | end |
| 156 | 158 | |
| 157 | -# def initialize(name) | |
| 158 | -# title = WikipediaPage.search_wikipedia_titles name | |
| 159 | -# raise ArgumentError, "No page found" if title.empty? | |
| 160 | -# super WikipediaPage.get_url title[0] | |
| 161 | -# end | |
| 162 | 159 | end |
| 163 | 160 | end |