Commit b55f47b3852621a367b08b37a5da99fe0b489ea4

Authored by Romain Deveaud
1 parent e267264ee3
Exists in master

resolving encoding problems

Showing 1 changed file with 5 additions and 5 deletions Inline Diff

lib/mirimiri/document.rb
1 #!/usr/bin/env ruby 1 #!/usr/bin/env ruby
2 2
3 #-- 3 #--
4 # This file is a part of the mirimiri library 4 # This file is a part of the mirimiri library
5 # 5 #
6 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> 6 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
7 # 7 #
8 # This program is free software: you can redistribute it and/or modify 8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by 9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or 10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version. 11 # (at your option) any later version.
12 # 12 #
13 # This program is distributed in the hope that it will be useful, 13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of 14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details. 16 # GNU General Public License for more details.
17 # 17 #
18 # You should have received a copy of the GNU General Public License 18 # You should have received a copy of the GNU General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>. 19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 #++ 20 #++
21 21
22 22
23 # General module 23 # General module
24 module Mirimiri 24 module Mirimiri
25 25
26 # A Document is a bag of words and is constructed from a string. 26 # A Document is a bag of words and is constructed from a string.
27 class Document 27 class Document
28 attr_reader :words, :doc_content, :count_words 28 attr_reader :words, :doc_content, :count_words
29 29
30 # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html 30 # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html
31 # and the \\W special escape). 31 # and the \\W special escape).
32 # 32 #
33 # Protected function, only meant to be called at initialization. 33 # Protected function, only meant to be called at initialization.
34 def format_words 34 def format_words
35 wo = [] 35 wo = []
36 36
37 @doc_content.split.each do |w| 37 @doc_content.split.each do |w|
38 w.split(/\W/).each do |sw| 38 w.split(/\W/).each do |sw|
39 wo.push(sw.downcase) if sw =~ /[a-zA-Z]/ 39 wo.push(sw.downcase) if sw =~ /[a-zA-Z]/
40 end 40 end
41 end 41 end
42 42
43 wo 43 wo
44 end 44 end
45 45
46 # Returns an Array containing the +n+-grams (words) from the current Document. 46 # Returns an Array containing the +n+-grams (words) from the current Document.
47 # 47 #
48 # ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...] 48 # ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]
49 def ngrams(n) 49 def ngrams(n)
50 window = [] 50 window = []
51 ngrams_array = [] 51 ngrams_array = []
52 52
53 @words.each do |w| 53 @words.each do |w|
54 window.push(w) 54 window.push(w)
55 if window.size == n 55 if window.size == n
56 ngrams_array.push window.join(" ") 56 ngrams_array.push window.join(" ")
57 window.delete_at(0) 57 window.delete_at(0)
58 end 58 end
59 end 59 end
60 60
61 ngrams_array.uniq 61 ngrams_array.uniq
62 end 62 end
63 63
64 # Returns a Hash containing the words and their associated counts in the current Document. 64 # Returns a Hash containing the words and their associated counts in the current Document.
65 # 65 #
66 # count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... } 66 # count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }
67 def count_words 67 def count_words
68 counts = Hash.new { |h,k| h[k] = 0 } 68 counts = Hash.new { |h,k| h[k] = 0 }
69 @words.each { |w| counts[w] += 1 } 69 @words.each { |w| counts[w] += 1 }
70 70
71 counts 71 counts
72 end 72 end
73 73
74 # Computes the entropy of a given string +s+ inside the document. 74 # Computes the entropy of a given string +s+ inside the document.
75 # 75 #
76 # If the string parameter is composed of many words (i.e. tokens separated 76 # If the string parameter is composed of many words (i.e. tokens separated
77 # by whitespace(s)), it is considered as an ngram. 77 # by whitespace(s)), it is considered as an ngram.
78 # 78 #
79 # entropy("guitar") #=> 0.00432114812727959 79 # entropy("guitar") #=> 0.00432114812727959
80 # entropy("dillinger escape plan") #=> 0.265862076325102 80 # entropy("dillinger escape plan") #=> 0.265862076325102
81 def entropy(s) 81 def entropy(s)
82 en = 0.0 82 en = 0.0
83 83
84 s.split.each do |w| 84 s.split.each do |w|
85 p_wi = @count_words[w].to_f/@words.count.to_f 85 p_wi = @count_words[w].to_f/@words.count.to_f
86 en += p_wi*Math.log2(p_wi) 86 en += p_wi*Math.log2(p_wi)
87 end 87 end
88 88
89 en *= -1 89 en *= -1
90 en 90 en
91 end 91 end
92 92
93 # Computes the term frequency of a given *word* +s+. 93 # Computes the term frequency of a given *word* +s+.
94 # 94 #
95 # tf("guitar") #=> 0.000380372765310004 95 # tf("guitar") #=> 0.000380372765310004
96 def tf(s) 96 def tf(s)
97 @count_words[s].to_f/@words.size.to_f 97 @count_words[s].to_f/@words.size.to_f
98 end 98 end
99 99
100 100
101 def initialize(content="") 101 def initialize(content="")
102 @doc_content = content 102 @doc_content = content
103 @words = format_words 103 @words = format_words
104 @count_words = count_words 104 @count_words = count_words
105 end 105 end
106 106
107 protected :format_words, :count_words 107 protected :format_words, :count_words
108 end 108 end
109 109
110 # A WebDocument is a Document with a +url+. 110 # A WebDocument is a Document with a +url+.
111 class WebDocument < Document 111 class WebDocument < Document
112 attr_reader :url 112 attr_reader :url
113 113
114 # Returns the HTML text from the page of a given +url+. 114 # Returns the HTML text from the page of a given +url+.
115 def self.get_content(url) 115 def self.get_content(url)
116 require 'net/http' 116 require 'net/http'
117 Net::HTTP.get(URI.parse(url)) 117 Net::HTTP.get(URI.parse(url))
118 end 118 end
119 119
120 # WebDocument constructor, the content of the Document is the HTML page 120 # WebDocument constructor, the content of the Document is the HTML page
121 # without the tags. 121 # without the tags.
122 def initialize(url,only_tags=nil) 122 def initialize(url,only_tags=nil)
123 @url = url 123 @url = url
124 content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("") 124 content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("")
125 super content.strip_javascripts.strip_stylesheets.strip_xml_tags 125 super content.strip_javascripts.strip_xml_tags
126 end 126 end
127 end 127 end
128 128
129 # A WikipediaPage is a WebDocument. 129 # A WikipediaPage is a WebDocument.
130 class WikipediaPage < WebDocument 130 class WikipediaPage < WebDocument
131 require 'rexml/document' 131 require 'rexml/document'
132 require 'net/http' 132 require 'net/http'
133 require 'kconv' 133 require 'kconv'
134 134
135 135
136 def self.search_wikipedia_titles(name) 136 def self.search_wikipedia_titles(name)
137 raise ArgumentError, "Bad encoding", name unless name.isutf8 137 raise ArgumentError, "Bad encoding", name unless name.isutf8
138 138
139 res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).toutf8).elements['api/query/search'] 139 res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).unaccent.toutf8).elements['api/query/search']
140 140
141 res.collect { |e| e.attributes['title'] } unless res.nil? 141 res.collect { |e| e.attributes['title'] } unless res.nil?
142 end 142 end
143 143
144 def self.get_url(name) 144 def self.get_url(name)
145 raise ArgumentError, "Bad encoding", name unless name.isutf8 145 raise ArgumentError, "Bad encoding", name unless name.isutf8
146 146
147 atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).toutf8).elements['api/query/pages/page'].attributes 147 atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).unaccent.toutf8).elements['api/query/pages/page'].attributes
148 148
149 atts['fullurl'] if atts['missing'].nil? 149 atts['fullurl'] if atts['missing'].nil?
150 end 150 end
151 151
152 def self.search_homepage(name) 152 def self.search_homepage(name)
153 title = WikipediaPage.search_wikipedia_titles name 153 title = WikipediaPage.search_wikipedia_titles name
154 154
155 WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty? 155 WikipediaPage.get_url(title[0]) unless title.nil? || title.empty?
156 end 156 end
157 157
158 end 158 end
159 end 159 end
160 160