Commit b506940c3f0fd9a95c10034e4e6b940a9381056e

Authored by Romain Deveaud
1 parent 845768f8ac
Exists in master

Add the possibility to extract only the contents of the given HTML tags when initializing a WebDocument

Showing 1 changed file with 4 additions and 7 deletions Inline Diff

lib/mirimiri/document.rb
1 #!/usr/bin/env ruby 1 #!/usr/bin/env ruby
2 2
3 #-- 3 #--
4 # This file is a part of the mirimiri library 4 # This file is a part of the mirimiri library
5 # 5 #
6 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> 6 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
7 # 7 #
8 # This program is free software: you can redistribute it and/or modify 8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by 9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or 10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version. 11 # (at your option) any later version.
12 # 12 #
13 # This program is distributed in the hope that it will be useful, 13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details. 16 # GNU General Public License for more details.
17 # 17 #
18 # You should have received a copy of the GNU General Public License 18 # You should have received a copy of the GNU General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>. 19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 #++ 20 #++
21 21
22 22
23 # General module 23 # General module
24 module Mirimiri 24 module Mirimiri
25 25
26 # A Document is a bag of words and is constructed from a string. 26 # A Document is a bag of words and is constructed from a string.
27 class Document 27 class Document
28 attr_reader :words, :doc_content 28 attr_reader :words, :doc_content
29 29
30 # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html 30 # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html
31 # and the \\W special escape). 31 # and the \\W special escape).
32 # 32 #
33 # Protected function, only meant to be called at initialization. 33 # Protected function, only meant to be called at initialization.
34 def format_words 34 def format_words
35 wo = [] 35 wo = []
36 36
37 @doc_content.split.each do |w| 37 @doc_content.split.each do |w|
38 w.split(/\W/).each do |sw| 38 w.split(/\W/).each do |sw|
39 wo.push(sw.downcase) if sw =~ /[a-zA-Z]/ 39 wo.push(sw.downcase) if sw =~ /[a-zA-Z]/
40 end 40 end
41 end 41 end
42 42
43 wo 43 wo
44 end 44 end
45 45
46 # Returns an Array containing the +n+-grams (words) from the current Document. 46 # Returns an Array containing the +n+-grams (words) from the current Document.
47 # 47 #
48 # ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...] 48 # ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]
49 def ngrams(n) 49 def ngrams(n)
50 window = [] 50 window = []
51 ngrams_array = [] 51 ngrams_array = []
52 52
53 @words.each do |w| 53 @words.each do |w|
54 window.push(w) 54 window.push(w)
55 if window.size == n 55 if window.size == n
56 ngrams_array.push window.join(" ") 56 ngrams_array.push window.join(" ")
57 window.delete_at(0) 57 window.delete_at(0)
58 end 58 end
59 end 59 end
60 60
61 ngrams_array.uniq 61 ngrams_array.uniq
62 end 62 end
63 63
64 # Returns a Hash containing the words and their associated counts in the current Document. 64 # Returns a Hash containing the words and their associated counts in the current Document.
65 # 65 #
66 # count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... } 66 # count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }
67 def count_words 67 def count_words
68 counts = Hash.new { |h,k| h[k] = 0 } 68 counts = Hash.new { |h,k| h[k] = 0 }
69 @words.each { |w| counts[w] += 1 } 69 @words.each { |w| counts[w] += 1 }
70 70
71 counts 71 counts
72 end 72 end
73 73
74 # Computes the entropy of a given string +s+ inside the document. 74 # Computes the entropy of a given string +s+ inside the document.
75 # 75 #
76 # If the string parameter is composed of several words (i.e. tokens separated 76 # If the string parameter is composed of several words (i.e. tokens separated
77 # by whitespace), it is treated as an ngram. 77 # by whitespace), it is treated as an ngram.
78 # 78 #
79 # entropy("guitar") #=> 0.00432114812727959 79 # entropy("guitar") #=> 0.00432114812727959
80 # entropy("dillinger escape plan") #=> 0.265862076325102 80 # entropy("dillinger escape plan") #=> 0.265862076325102
81 def entropy(s) 81 def entropy(s)
82 en = 0.0 82 en = 0.0
83 # TODO: store the result of count_words as an attribute instead of recomputing it on each call?
83 counts = self.count_words 84 counts = self.count_words
84 85
85 s.split.each do |w| 86 s.split.each do |w|
86 p_wi = counts[w].to_f/@words.count.to_f 87 p_wi = counts[w].to_f/@words.count.to_f
87 en += p_wi*Math.log2(p_wi) 88 en += p_wi*Math.log2(p_wi)
88 end 89 end
89 90
90 en *= -1 91 en *= -1
91 en 92 en
92 end 93 end
93 94
94 # Computes the term frequency of a given *word* +s+. 95 # Computes the term frequency of a given *word* +s+.
95 # 96 #
96 # tf("guitar") #=> 0.000380372765310004 97 # tf("guitar") #=> 0.000380372765310004
97 def tf(s) 98 def tf(s)
98 self.count_words[s].to_f/@words.size.to_f 99 self.count_words[s].to_f/@words.size.to_f
99 end 100 end
100 101
101 102
102 def initialize(content="") 103 def initialize(content="")
103 @doc_content = content 104 @doc_content = content
104 @words = format_words 105 @words = format_words
105 end 106 end
106 107
107 protected :format_words 108 protected :format_words
108 end 109 end
109 110
110 # A WebDocument is a Document with a +url+. 111 # A WebDocument is a Document with a +url+.
111 class WebDocument < Document 112 class WebDocument < Document
112 attr_reader :url 113 attr_reader :url
113 114
114 # Returns the HTML text from the page of a given +url+. 115 # Returns the HTML text from the page of a given +url+.
115 def self.get_content(url) 116 def self.get_content(url)
116 require 'net/http' 117 require 'net/http'
117 Net::HTTP.get(URI.parse(url)) 118 Net::HTTP.get(URI.parse(url))
118 end 119 end
119 120
120 # WebDocument constructor, the content of the Document is the HTML page 121 # WebDocument constructor, the content of the Document is the HTML page
121 # without the tags. 122 # without the tags.
122 def initialize(url) 123 def initialize(url,only_tags=nil)
123 @url = url 124 @url = url
124 super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags 125 content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("")
126 super content.strip_javascripts.strip_stylesheets.strip_xml_tags
125 end 127 end
126 end 128 end
127 129
128 # A WikipediaPage is a WebDocument. 130 # A WikipediaPage is a WebDocument.
129 class WikipediaPage < WebDocument 131 class WikipediaPage < WebDocument
130 require 'rexml/document' 132 require 'rexml/document'
131 require 'net/http' 133 require 'net/http'
132 require 'kconv' 134 require 'kconv'
133 135
134 136
135 def self.search_wikipedia_titles(name) 137 def self.search_wikipedia_titles(name)
136 raise ArgumentError, "Bad encoding", name unless name.isutf8 138 raise ArgumentError, "Bad encoding", name unless name.isutf8
137 139
138 res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).toutf8).elements['api/query/search'] 140 res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).toutf8).elements['api/query/search']
139 141
140 res.collect { |e| e.attributes['title'] } unless res.nil? 142 res.collect { |e| e.attributes['title'] } unless res.nil?
141 end 143 end
142 144
143 def self.get_url(name) 145 def self.get_url(name)
144 raise ArgumentError, "Bad encoding", name unless name.isutf8 146 raise ArgumentError, "Bad encoding", name unless name.isutf8
145 147
146 atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).toutf8).elements['api/query/pages/page'].attributes 148 atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).toutf8).elements['api/query/pages/page'].attributes
147 149
148 atts['fullurl'] if atts['missing'].nil? 150 atts['fullurl'] if atts['missing'].nil?
149 end 151 end
150 152
151 def self.search_homepage(name) 153 def self.search_homepage(name)
152 title = WikipediaPage.search_wikipedia_titles name 154 title = WikipediaPage.search_wikipedia_titles name
153 155
154 WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty? 156 WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty?
155 end 157 end
156 158
157 # def initialize(name)
158 # title = WikipediaPage.search_wikipedia_titles name
159 # raise ArgumentError, "No page found" if title.empty?
160 # super WikipediaPage.get_url title[0]
161 # end
162 end 159 end