Commit 8f90ef69c4dd38d0e5d298af9f6d7f50b8376bc2

Authored by Romain Deveaud
1 parent b506940c3f
Exists in master

calls to the count_words method of Document are no longer allowed

Showing 1 changed file with 4 additions and 5 deletions Inline Diff

lib/mirimiri/document.rb
1 #!/usr/bin/env ruby 1 #!/usr/bin/env ruby
2 2
3 #-- 3 #--
4 # This file is a part of the mirimiri library 4 # This file is a part of the mirimiri library
5 # 5 #
6 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> 6 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
7 # 7 #
8 # This program is free software: you can redistribute it and/or modify 8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by 9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or 10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version. 11 # (at your option) any later version.
12 # 12 #
13 # This program is distributed in the hope that it will be useful, 13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details. 16 # GNU General Public License for more details.
17 # 17 #
18 # You should have received a copy of the GNU General Public License 18 # You should have received a copy of the GNU General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>. 19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 #++ 20 #++
21 21
22 22
23 # General module 23 # General module
24 module Mirimiri 24 module Mirimiri
25 25
26 # A Document is a bag of words and is constructed from a string. 26 # A Document is a bag of words and is constructed from a string.
27 class Document 27 class Document
28 attr_reader :words, :doc_content 28 attr_reader :words, :doc_content
29 29
30 # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html 30 # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html
31 # and the \\W special escape). 31 # and the \\W special escape).
32 # 32 #
33 # Protected function, only meant to be called at initialization. 33 # Protected function, only meant to be called at initialization.
34 def format_words 34 def format_words
35 wo = [] 35 wo = []
36 36
37 @doc_content.split.each do |w| 37 @doc_content.split.each do |w|
38 w.split(/\W/).each do |sw| 38 w.split(/\W/).each do |sw|
39 wo.push(sw.downcase) if sw =~ /[a-zA-Z]/ 39 wo.push(sw.downcase) if sw =~ /[a-zA-Z]/
40 end 40 end
41 end 41 end
42 42
43 wo 43 wo
44 end 44 end
45 45
46 # Returns an Array containing the +n+-grams (words) from the current Document. 46 # Returns an Array containing the +n+-grams (words) from the current Document.
47 # 47 #
48 # ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...] 48 # ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]
49 def ngrams(n) 49 def ngrams(n)
50 window = [] 50 window = []
51 ngrams_array = [] 51 ngrams_array = []
52 52
53 @words.each do |w| 53 @words.each do |w|
54 window.push(w) 54 window.push(w)
55 if window.size == n 55 if window.size == n
56 ngrams_array.push window.join(" ") 56 ngrams_array.push window.join(" ")
57 window.delete_at(0) 57 window.delete_at(0)
58 end 58 end
59 end 59 end
60 60
61 ngrams_array.uniq 61 ngrams_array.uniq
62 end 62 end
63 63
64 # Returns a Hash containing the words and their associated counts in the current Document. 64 # Returns a Hash containing the words and their associated counts in the current Document.
65 # 65 #
66 # count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... } 66 # count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }
67 def count_words 67 def count_words
68 counts = Hash.new { |h,k| h[k] = 0 } 68 counts = Hash.new { |h,k| h[k] = 0 }
69 @words.each { |w| counts[w] += 1 } 69 @words.each { |w| counts[w] += 1 }
70 70
71 counts 71 counts
72 end 72 end
73 73
74 # Computes the entropy of a given string +s+ inside the document. 74 # Computes the entropy of a given string +s+ inside the document.
75 # 75 #
76 # If the string parameter is composed of many words (i.e. tokens separated 76 # If the string parameter is composed of many words (i.e. tokens separated
77 # by whitespace(s)), it is considered as an ngram. 77 # by whitespace(s)), it is considered as an ngram.
78 # 78 #
79 # entropy("guitar") #=> 0.00432114812727959 79 # entropy("guitar") #=> 0.00432114812727959
80 # entropy("dillinger escape plan") #=> 0.265862076325102 80 # entropy("dillinger escape plan") #=> 0.265862076325102
81 def entropy(s) 81 def entropy(s)
82 en = 0.0 82 en = 0.0
83 # TODO: count_words as an attribute?
84 counts = self.count_words
85 83
86 s.split.each do |w| 84 s.split.each do |w|
87 p_wi = counts[w].to_f/@words.count.to_f 85 p_wi = @count_words[w].to_f/@words.count.to_f
88 en += p_wi*Math.log2(p_wi) 86 en += p_wi*Math.log2(p_wi)
89 end 87 end
90 88
91 en *= -1 89 en *= -1
92 en 90 en
93 end 91 end
94 92
95 # Computes the term frequency of a given *word* +s+. 93 # Computes the term frequency of a given *word* +s+.
96 # 94 #
97 # tf("guitar") #=> 0.000380372765310004 95 # tf("guitar") #=> 0.000380372765310004
98 def tf(s) 96 def tf(s)
99 self.count_words[s].to_f/@words.size.to_f 97 @count_words[s].to_f/@words.size.to_f
100 end 98 end
101 99
102 100
103 def initialize(content="") 101 def initialize(content="")
104 @doc_content = content 102 @doc_content = content
105 @words = format_words 103 @words = format_words
104 @count_words = count_words
106 end 105 end
107 106
108 protected :format_words 107 protected :format_words, :count_words
109 end 108 end
110 109
111 # A WebDocument is a Document with a +url+. 110 # A WebDocument is a Document with a +url+.
112 class WebDocument < Document 111 class WebDocument < Document
113 attr_reader :url 112 attr_reader :url
114 113
115 # Returns the HTML text from the page of a given +url+. 114 # Returns the HTML text from the page of a given +url+.
116 def self.get_content(url) 115 def self.get_content(url)
117 require 'net/http' 116 require 'net/http'
118 Net::HTTP.get(URI.parse(url)) 117 Net::HTTP.get(URI.parse(url))
119 end 118 end
120 119
121 # WebDocument constructor, the content of the Document is the HTML page 120 # WebDocument constructor, the content of the Document is the HTML page
122 # without the tags. 121 # without the tags.
123 def initialize(url,only_tags=nil) 122 def initialize(url,only_tags=nil)
124 @url = url 123 @url = url
125 content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("") 124 content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("")
126 super content.strip_javascripts.strip_stylesheets.strip_xml_tags 125 super content.strip_javascripts.strip_stylesheets.strip_xml_tags
127 end 126 end
128 end 127 end
129 128
130 # A WikipediaPage is a WebDocument. 129 # A WikipediaPage is a WebDocument.
131 class WikipediaPage < WebDocument 130 class WikipediaPage < WebDocument
132 require 'rexml/document' 131 require 'rexml/document'
133 require 'net/http' 132 require 'net/http'
134 require 'kconv' 133 require 'kconv'
135 134
136 135
137 def self.search_wikipedia_titles(name) 136 def self.search_wikipedia_titles(name)
138 raise ArgumentError, "Bad encoding", name unless name.isutf8 137 raise ArgumentError, "Bad encoding", name unless name.isutf8
139 138
140 res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).toutf8).elements['api/query/search'] 139 res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).toutf8).elements['api/query/search']
141 140
142 res.collect { |e| e.attributes['title'] } unless res.nil? 141 res.collect { |e| e.attributes['title'] } unless res.nil?
143 end 142 end
144 143
145 def self.get_url(name) 144 def self.get_url(name)
146 raise ArgumentError, "Bad encoding", name unless name.isutf8 145 raise ArgumentError, "Bad encoding", name unless name.isutf8
147 146
148 atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).toutf8).elements['api/query/pages/page'].attributes 147 atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).toutf8).elements['api/query/pages/page'].attributes
149 148
150 atts['fullurl'] if atts['missing'].nil? 149 atts['fullurl'] if atts['missing'].nil?
151 end 150 end
152 151
153 def self.search_homepage(name) 152 def self.search_homepage(name)
154 title = WikipediaPage.search_wikipedia_titles name 153 title = WikipediaPage.search_wikipedia_titles name
155 154
156 WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty? 155 WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty?
157 end 156 end
158 157
159 end 158 end
160 end 159 end