Blame view
lib/mirimiri/document.rb
5.83 KB
cd7432252
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 |
#!/usr/bin/env ruby #-- # This file is a part of the mirimiri library # # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. #++ # General module module Mirimiri # A Document is a bag of words and is constructed from a string. class Document |
e267264ee
|
28 |
# Exposed state: the token list, the raw text the Document was built from,
# and the token => occurrences table.
#
# NOTE(review): the :count_words reader declared here is later overridden
# by `def count_words` (which is subsequently made protected), so the
# counts are not actually readable from outside the class — confirm
# whether a public reader is intended.
attr_reader :words, :doc_content, :count_words
cd7432252
|
29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
# Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html
# and the \\W special escape).
#
# Tokens that contain no ASCII letter at all (pure numbers, punctuation
# runs) are dropped; the survivors are downcased.
#
# Protected function, only meant to by called at the initialization.
def format_words
  @doc_content.split.flat_map do |token|
    token.split(/\W/).select { |sub| sub =~ /[a-zA-Z]/ }.map(&:downcase)
  end
end

# Returns an Array containing the +n+-grams (words) from the current Document.
#
#   ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]
def ngrams(n)
  # A non-positive window produces no ngrams (mirrors the historical
  # sliding-window behavior; Enumerable#each_cons would raise on n < 1).
  return [] if n < 1
  @words.each_cons(n).map { |gram| gram.join(" ") }
end

# Returns a Hash containing the words and their associated counts in the current Document.
#
#   count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }
#
# The returned Hash auto-vivifies missing keys to 0 via its default block.
def count_words
  @words.each_with_object(Hash.new { |h, k| h[k] = 0 }) do |token, tally|
    tally[token] += 1
  end
end
e0e33fca0
|
72 73 74 75 76 77 78 79 80 81 82 83 84 |
# Old entropy function.
# TODO: remove.
#
# Sums -p(w) * log2(p(w)) over every whitespace-separated token of +s+,
# where p(w) is the token's relative frequency in the document.
# NOTE(review): a token absent from the document yields p(w) = 0 and the
# result degrades to NaN (0 * log2(0)) — unchanged legacy behavior.
def entropy0(s)
  s.split.reduce(0.0) do |acc, token|
    p_w = @count_words[token].to_f / @words.count.to_f
    acc - p_w * Math.log2(p_w)
  end
end
cd7432252
|
85 86 87 88 89 |
# Computes the entropy of a given string +s+ inside the document.
#
# If the string parameter is composed of many words (i.e. tokens separated
# by whitespace(s)), it is considered as an ngram.
#
#   entropy("guitar") #=> 0.014348983965324762
#   entropy("dillinger escape plan") #=> 0.054976093116768154
#
# Returns 0.0 when +s+ does not occur in the document (previously this
# returned NaN, because 0 * Math.log(0) is NaN in Float arithmetic).
#
# NOTE(review): this method uses the natural logarithm while entropy0
# uses log2 — confirm which base is intended.
def entropy(s)
  en = 0.0
  size = s.split.size

  if size == 1
    # Single word: probability from the precomputed word counts.
    p_wi = @count_words[s].to_f/@words.count.to_f
  elsif size > 1
    # Ngram: probability among all ngrams of the same length.
    grams = ngrams(size)
    p_wi = grams.count(s).to_f/grams.count.to_f
  else
    p_wi = 0.0 # empty string carries no information
  end

  # Guard: skip the contribution when p_wi is 0 (term absent from the
  # document) or NaN (document shorter than the ngram, 0.0/0.0).
  # By the usual entropy convention, 0 * log(0) counts as 0.
  en += p_wi*Math.log(p_wi) if p_wi > 0.0

  en *= -1
  en
end

# Computes the term frequency of a given *word* +s+.
#
#   tf("guitar") #=> 0.000380372765310004
def tf(s)
  @count_words[s].to_f/@words.size.to_f
end
845768f8a
|
116 |
# Builds a Document from +content+: stores the raw text, tokenizes it
# via format_words, then precomputes the word => count table.
# Statement order matters: count_words reads @words, which format_words
# derives from @doc_content.
def initialize(content="")
  @doc_content = content
  @words = format_words
  @count_words = count_words
end

# Tokenization and counting are implementation details of construction.
#
# NOTE(review): making :count_words protected hides the reader declared
# by `attr_reader :count_words` — the `def count_words` above replaced
# that reader, so after this line the counts cannot be read from outside
# the class. Consider renaming the helper or re-exposing a reader.
protected :format_words, :count_words
cd7432252
|
122 123 124 125 126 127 128 129 130 131 132 |
end

# A WebDocument is a Document with a +url+.
class WebDocument < Document
  attr_reader :url

  # Returns the HTML text from the page of a given +url+.
  #
  # Performs a plain, blocking HTTP GET with no redirect or error
  # handling; any network failure propagates to the caller.
  def self.get_content(url)
    require 'net/http' # lazy-loaded on first use
    Net::HTTP.get(URI.parse(url))
  end

  # WebDocument constructor, the content of the Document is the HTML page
  # without the tags.
  #
  # When +only_tags+ is given, only the textual values of those tags are
  # kept (String#extract_xmltags_values — a mirimiri core extension)
  # before sanitizing.
  def initialize(url,only_tags=nil)
    require 'sanitize' # lazy-loaded on first use
    @url = url
    content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("")
    # #unaccent / #toutf8 come from external String extensions (kconv et
    # al. — presumably transliteration + UTF-8 coercion; verify against
    # the extensions' docs). Sanitize strips markup and drops <script>
    # element bodies entirely.
    super Sanitize.clean(content.unaccent.toutf8.force_encoding("UTF-8"), :remove_contents => ['script'])
  end
end

# A WikipediaPage is a WebDocument.
class WikipediaPage < WebDocument
  require 'rexml/document'
  require 'net/http'
  require 'kconv'

  # Queries the MediaWiki search API and returns the matching page
  # titles as an Array of Strings, or nil when the response carries no
  # search section.
  #
  # NOTE(review): the third argument of Kernel#raise is the *backtrace*;
  # passing +name+ there is almost certainly meant for the message
  # (e.g. "Bad encoding: #{name}"). Same pattern in get_url below.
  # NOTE(review): URI.escape was removed in Ruby 3.0 — this needs
  # migrating to URI.encode_www_form_component (or CGI.escape).
  def self.search_wikipedia_titles(name)
    raise ArgumentError, "Bad encoding", name unless name.isutf8

    res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).unaccent.toutf8).elements['api/query/search']

    res.collect { |e| e.attributes['title'] } unless res.nil?
  end

  # Resolves a page title to its full Wikipedia URL, or nil when the API
  # flags the page as missing.
  def self.get_url(name)
    raise ArgumentError, "Bad encoding", name unless name.isutf8

    atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).unaccent.toutf8).elements['api/query/pages/page'].attributes

    atts['fullurl'] if atts['missing'].nil?
  end

  # Convenience wrapper: URL of the best-matching page for +name+, or
  # nil when the title search returned nothing.
  def self.search_homepage(name)
    title = WikipediaPage.search_wikipedia_titles name

    WikipediaPage.get_url(title[0]) unless title.nil? || title.empty?
  end
aa386f553
|
170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 |
def self.extract_anchors(url) self.get_content(url).extract_xmltags_values('p').join(' ').scan(/<a href="(.+?)" title=.*?>(.+?)<\/a>/).delete_if { |a| a[0] =~ /^\/wiki\/.*$/.negated } end end class FreebasePage < WebDocument require 'net/http' require 'kconv' require 'json' def self.search_article_ids query,limit raise ArgumentError, "Bad encoding", name unless name.isutf8 JSON.parse(Net::HTTP.get( URI.parse "http://api.freebase.com/api/service/search?query=#{query.gsub(" ","+")}&limit=#{limit}" ))['result'].collect { |a| a['article']['id'] unless a['article'].nil? }.compact end def self.get_url id "http://api.freebase.com/api/trans/raw#{id}" end |
cd7432252
|
189 190 |
end end |