#!/usr/bin/env ruby

#--
# This file is a part of the mirimiri library
#
# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#++

# General module
module Mirimiri
  # A Document is a bag of words and is constructed from a string.
  class Document
    # NOTE(review): the readers for :count_words and :words expose cached
    # state set in #initialize; the :count_words reader is overridden by the
    # protected method of the same name defined below, so external callers
    # cannot actually read the cached counts.
    attr_reader :words, :doc_content, :count_words

    # Splits the document content into lowercase word tokens.
    # Any non-word characters are removed from the words (see
    # http://perldoc.perl.org/perlre.html and the \W special escape);
    # sub-tokens containing no ASCII letter are discarded.
    #
    # Protected function, only meant to be called at the initialization.
    def format_words
      wo = []
      @doc_content.split.each do |w|
        w.split(/\W/).each do |sw|
          wo.push(sw.downcase) if sw =~ /[a-zA-Z]/
        end
      end
      wo
    end

    # Returns an Array containing the unique +n+-grams (words) from the
    # current Document.
    #
    #   ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", ...]
    def ngrams(n)
      return [] if n < 1 # matches the original sliding-window behavior for n <= 0
      @words.each_cons(n).map { |gram| gram.join(" ") }.uniq
    end

    # Returns a Hash containing the words and their associated counts in
    # the current Document. Accessing a missing key yields 0.
    #
    #   count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }
    def count_words
      counts = Hash.new { |h, k| h[k] = 0 }
      @words.each { |w| counts[w] += 1 }
      counts
    end

    # Computes the entropy of a given string +s+ inside the document.
    #
    # If the string parameter is composed of many words (i.e. tokens separated
    # by whitespace(s)), it is considered as an ngram.
    #
    # Words that never occur in the document are skipped: previously they
    # produced p_wi == 0 and 0*log2(0) == NaN, poisoning the whole sum.
    #
    #   entropy("guitar")                #=> 0.00432114812727959
    #   entropy("dillinger escape plan") #=> 0.265862076325102
    def entropy(s)
      en = 0.0
      s.split.each do |w|
        p_wi = @count_words[w].to_f / @words.count.to_f
        next if p_wi.zero? # an unseen word contributes no information
        en += p_wi * Math.log2(p_wi)
      end
      -en
    end

    # Computes the term frequency of a given *word* +s+.
    #
    #   tf("guitar") #=> 0.000380372765310004
    def tf(s)
      @count_words[s].to_f / @words.size.to_f
    end

    # Builds the bag of words from +content+ and caches the word counts.
    def initialize(content="")
      @doc_content = content
      @words = format_words
      @count_words = count_words
    end

    protected :format_words, :count_words
  end

  # A WebDocument is a Document with a +url+.
  class WebDocument < Document
    attr_reader :url

    # Returns the HTML text from the page of a given +url+.
    def self.get_content(url)
      require 'net/http'
      Net::HTTP.get(URI.parse(url))
    end

    # WebDocument constructor, the content of the Document is the HTML page
    # without the tags.
    #
    # When +only_tags+ is given, only the values of those XML tags are kept
    # (String#extract_xmltags_values and String#unaccent come from mirimiri's
    # core extensions; #toutf8 from kconv).
    def initialize(url, only_tags=nil)
      require 'sanitize'
      @url = url
      content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("")
      super Sanitize.clean(content.unaccent.toutf8.force_encoding("UTF-8"), :remove_contents => ['script'])
    end
  end

  # A WikipediaPage is a WebDocument.
  class WikipediaPage < WebDocument
    require 'rexml/document'
    require 'net/http'
    require 'kconv'

    # Queries the Wikipedia search API and returns the matching page titles,
    # or nil when the response carries no search section.
    #
    # Raises ArgumentError if +name+ is not valid UTF-8. (Previously +name+
    # was passed as the third argument of Kernel#raise, i.e. as a backtrace,
    # so it never appeared in the error message.)
    def self.search_wikipedia_titles(name)
      raise ArgumentError, "Bad encoding: #{name}" unless name.isutf8

      # NOTE(review): URI.escape is deprecated and removed in Ruby 3.0+;
      # URI.encode_www_form_component would be the modern replacement.
      res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).unaccent.toutf8).elements['api/query/search']

      res.collect { |e| e.attributes['title'] } unless res.nil?
    end

    # Resolves the canonical URL of the Wikipedia page titled +name+,
    # or nil when the page does not exist ('missing' attribute present).
    def self.get_url(name)
      raise ArgumentError, "Bad encoding: #{name}" unless name.isutf8

      atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).unaccent.toutf8).elements['api/query/pages/page'].attributes

      atts['fullurl'] if atts['missing'].nil?
    end

    # Returns the URL of the best-matching Wikipedia page for +name+.
    def self.search_homepage(name)
      title = WikipediaPage.search_wikipedia_titles name

      WikipediaPage.get_url(title[0]) unless title.nil? || title.empty?
    end

    # Extracts the [href, text] pairs of the wiki links found in the
    # paragraphs of the page at +url+ (Regexp#negated is a mirimiri
    # core extension).
    def self.extract_anchors(url)
      self.get_content(url).extract_xmltags_values('p').join(' ').scan(/<a href="(.+?)" title=.*?>(.+?)<\/a>/).delete_if { |a| a[0] =~ /^\/wiki\/.*$/.negated }
    end
  end

  # A FreebasePage is a WebDocument.
  class FreebasePage < WebDocument
    require 'net/http'
    require 'kconv'
    require 'json'

    # Searches the Freebase API and returns up to +limit+ article ids
    # matching +query+.
    #
    # Raises ArgumentError if +query+ is not valid UTF-8. (Previously the
    # guard tested `name` — which resolves to the class name here and is
    # always UTF-8 — so invalid queries were never rejected.)
    def self.search_article_ids query,limit
      raise ArgumentError, "Bad encoding: #{query}" unless query.isutf8
      JSON.parse(Net::HTTP.get( URI.parse "http://api.freebase.com/api/service/search?query=#{query.gsub(" ","+")}&limit=#{limit}" ))['result'].collect { |a| a['article']['id'] unless a['article'].nil? }.compact
    end

    # Returns the raw-content URL for a Freebase article +id+.
    def self.get_url id
      "http://api.freebase.com/api/trans/raw#{id}"
    end
  end
end