Commit b506940c3f0fd9a95c10034e4e6b940a9381056e
1 parent
845768f8ac
Exists in
master
Add the possibility to extract only the contents of selected HTML fields when initializing a WebDocument
Showing 1 changed file with 4 additions and 7 deletions Inline Diff
lib/mirimiri/document.rb
1 | #!/usr/bin/env ruby | 1 | #!/usr/bin/env ruby |
2 | 2 | ||
3 | #-- | 3 | #-- |
4 | # This file is a part of the mirimiri library | 4 | # This file is a part of the mirimiri library |
5 | # | 5 | # |
6 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | 6 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> |
7 | # | 7 | # |
8 | # This program is free software: you can redistribute it and/or modify | 8 | # This program is free software: you can redistribute it and/or modify |
9 | # it under the terms of the GNU General Public License as published by | 9 | # it under the terms of the GNU General Public License as published by |
10 | # the Free Software Foundation, either version 3 of the License, or | 10 | # the Free Software Foundation, either version 3 of the License, or |
11 | # (at your option) any later version. | 11 | # (at your option) any later version. |
12 | # | 12 | # |
13 | # This program is distributed in the hope that it will be useful, | 13 | # This program is distributed in the hope that it will be useful, |
14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | 14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | # GNU General Public License for more details. | 16 | # GNU General Public License for more details. |
17 | # | 17 | # |
18 | # You should have received a copy of the GNU General Public License | 18 | # You should have received a copy of the GNU General Public License |
19 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | 19 | # along with this program. If not, see <http://www.gnu.org/licenses/>. |
20 | #++ | 20 | #++ |
21 | 21 | ||
22 | 22 | ||
23 | # General module | 23 | # General module |
24 | module Mirimiri | 24 | module Mirimiri |
25 | 25 | ||
# A Document is a bag of words built from a plain-text string.
class Document
  attr_reader :words, :doc_content

  # Tokenizes +@doc_content+: splits on whitespace, then on any non-word
  # character (\W), keeps only tokens containing at least one ASCII letter,
  # and downcases them.
  #
  # Protected helper, only meant to be called at initialization.
  #
  # @return [Array<String>] the normalized word list
  def format_words
    @doc_content.split.flat_map do |token|
      token.split(/\W/).select { |sub| sub =~ /[a-zA-Z]/ }.map(&:downcase)
    end
  end

  # Returns an Array containing the unique +n+-grams (of words) from the
  # current Document.
  #
  #   ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", ...]
  #
  # @param n [Integer] size of the sliding window
  # @return [Array<String>]
  def ngrams(n)
    # each_cons yields every consecutive window of n words (a sliding window).
    @words.each_cons(n).map { |window| window.join(" ") }.uniq
  end

  # Returns a Hash mapping each word to its occurrence count in the Document.
  #
  #   count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }
  #
  # NOTE: the returned Hash has a default value of 0, so looking up a missing
  # word returns 0 *without* inserting the key (the previous block-form
  # default mutated the hash on mere lookup).
  #
  # @return [Hash{String=>Integer}]
  def count_words
    @words.each_with_object(Hash.new(0)) { |w, counts| counts[w] += 1 }
  end

  # Computes the entropy of a given string +s+ inside the document.
  #
  # If the string parameter is composed of many words (i.e. tokens separated
  # by whitespace(s)), it is considered as an ngram.
  #
  #   entropy("guitar")                #=> 0.00432114812727959
  #   entropy("dillinger escape plan") #=> 0.265862076325102
  #
  # Words absent from the document contribute 0 (by the usual 0*log(0) = 0
  # convention); previously they produced NaN via 0 * log2(0).
  #
  # @param s [String] a word or whitespace-separated ngram
  # @return [Float] the (non-negative) entropy
  def entropy(s)
    counts = self.count_words
    total = @words.count.to_f

    en = s.split.reduce(0.0) do |acc, w|
      p_wi = counts[w].to_f / total
      next acc if p_wi.zero? # skip absent words: lim p->0 of p*log2(p) is 0
      acc + p_wi * Math.log2(p_wi)
    end

    -en
  end

  # Computes the term frequency of a given *word* +s+.
  #
  #   tf("guitar") #=> 0.000380372765310004
  #
  # Returns 0.0 for an empty document (avoids 0/0 => NaN).
  #
  # @param s [String] a single word
  # @return [Float]
  def tf(s)
    return 0.0 if @words.empty?
    self.count_words[s].to_f / @words.size.to_f
  end

  # @param content [String] raw text the bag of words is built from
  def initialize(content="")
    @doc_content = content
    @words = format_words
  end

  protected :format_words
end
109 | 110 | ||
# A WebDocument is a Document with a +url+; its content is fetched over HTTP.
class WebDocument < Document
  attr_reader :url

  # Fetches and returns the raw HTML body of the page at +url+.
  #
  # @param url [String]
  # @return [String] the HTTP response body
  def self.get_content(url)
    require 'net/http'
    Net::HTTP.get(URI.parse(url))
  end

  # WebDocument constructor. The Document content is the fetched page with
  # scripts, stylesheets and markup stripped. When +only_tags+ is given,
  # only the text found inside those XML/HTML tags is kept.
  #
  # @param url [String] page to fetch
  # @param only_tags restrict extraction to these tags (nil = whole page)
  def initialize(url, only_tags=nil)
    @url = url
    page = WebDocument.get_content(url)
    page = page.extract_xmltags_values(only_tags).join("") unless only_tags.nil?
    super page.strip_javascripts.strip_stylesheets.strip_xml_tags
  end
end
127 | 129 | ||
# A WikipediaPage is a WebDocument backed by the English Wikipedia API.
class WikipediaPage < WebDocument
  require 'rexml/document'
  require 'net/http'
  require 'kconv'

  # Searches Wikipedia for pages matching +name+.
  #
  # NOTE(review): URI.escape is deprecated (removed in Ruby 3); consider
  # URI.encode_www_form_component when the supported Ruby version allows it.
  #
  # @param name [String] UTF-8 search query
  # @return [Array<String>, nil] matching page titles, or nil when the API
  #   response has no search section
  # @raise [ArgumentError] if +name+ is not valid UTF-8
  def self.search_wikipedia_titles(name)
    # BUG FIX: the original `raise ArgumentError, "Bad encoding", name` passed
    # +name+ as raise's third argument (the *backtrace*); put it in the message.
    raise ArgumentError, "Bad encoding: #{name}" unless name.isutf8

    res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).toutf8).elements['api/query/search']

    res.collect { |e| e.attributes['title'] } unless res.nil?
  end

  # Resolves the full URL of the Wikipedia page titled +name+.
  #
  # @param name [String] UTF-8 page title
  # @return [String, nil] the page's full URL, or nil when the page is missing
  # @raise [ArgumentError] if +name+ is not valid UTF-8
  def self.get_url(name)
    raise ArgumentError, "Bad encoding: #{name}" unless name.isutf8

    atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).toutf8).elements['api/query/pages/page'].attributes

    atts['fullurl'] if atts['missing'].nil?
  end

  # Builds a WikipediaPage from the best search match for +name+.
  #
  # @param name [String] UTF-8 search query
  # @return [WikipediaPage, nil] nil when the search yields no title
  def self.search_homepage(name)
    title = WikipediaPage.search_wikipedia_titles name

    WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty?
  end
end