Blame view
lib/rir/document.rb
4.75 KB
7043da90b first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 |
#!/usr/bin/env ruby
# This file is a part of an Information Retrieval oriented Ruby library
#
# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

# General module for many purposes related to Information Retrieval.
module RIR
  # A Document is a bag of words and is constructed from a string.
  class Document
    attr_reader :words, :doc_content

    # Builds the bag of words from +@doc_content+.
    #
    # Tokens are first split on whitespace, then on any non-word
    # character (see http://perldoc.perl.org/perlre.html and the \W
    # special escape); only fragments containing at least one ASCII
    # letter are kept, lowercased.
    #
    # Protected function, only meant to be called at the initialization.
    def format_words
      wo = []
      @doc_content.split.each do |w|
        w.split(/\W/).each do |sw|
          wo.push(sw.downcase) if sw =~ /[a-zA-Z]/
        end
      end
      wo
    end

    # Returns an Array containing the unique +n+-grams (words) from the
    # current Document.
    #
    #   ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", ...]
    def ngrams(n)
      window = []
      ngrams_array = []
      @words.each do |w|
        window.push(w)
        # Emit the n-gram once the sliding window is full, then slide.
        if window.size == n
          ngrams_array.push window.join(" ")
          window.delete_at(0)
        end
      end
      ngrams_array.uniq
    end

    # Returns a Hash containing the words and their associated counts
    # in the current Document. Missing words count as 0.
    #
    #   count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }
    def count_words
      # Integer defaults are immutable, so the shared-default form is safe.
      counts = Hash.new(0)
      @words.each { |w| counts[w] += 1 }
      counts
    end

    # Computes the entropy of a given string +s+ inside the document.
    #
    # If the string parameter is composed of many words (i.e. tokens
    # separated by whitespace(s)), it is considered as an ngram.
    #
    #   entropy("guitar")                #=> 0.00432114812727959
    #   entropy("dillinger escape plan") #=> 0.265862076325102
    def entropy(s)
      en = 0.0
      counts = count_words
      s.split.each do |w|
        c = counts[w]
        # A word absent from the document has probability 0 and the
        # limit contribution 0 * log2(0) -> 0; computing it directly
        # would yield 0.0 * -Infinity == NaN and poison the sum.
        next if c.zero?

        p_wi = c.to_f / @words.count
        en -= p_wi * Math.log2(p_wi)
      end
      en
    end

    # Computes the term frequency of a given *word* +s+.
    #
    #   tf("guitar") #=> 0.000380372765310004
    def tf(s)
      count_words[s].to_f / @words.size
    end

    # A Document is built from a raw content string; the bag of words
    # is computed once, eagerly.
    def initialize(content)
      @doc_content = content
      @words = format_words
    end

    protected :format_words
  end

  # A WebDocument is a Document with a +url+.
  class WebDocument < Document
    attr_reader :url

    # Returns the HTML text from the page of a given +url+.
    def self.get_content(url)
      require 'net/http'
      Net::HTTP.get(URI.parse(url))
    end

    # WebDocument constructor, the content of the Document is the HTML
    # page without the tags.
    #
    # NOTE(review): strip_javascripts / strip_stylesheets /
    # strip_xml_tags are String extensions defined elsewhere in this
    # library — confirm they are loaded before instantiating.
    def initialize(url)
      @url = url
      super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags
    end
  end

  # A WikipediaPage is a WebDocument built through the Wikipedia API.
  class WikipediaPage < WebDocument
    # Queries the Wikipedia search API for +name+ and returns the
    # matching page titles (Array of String), or nil when the API
    # response has no <search> element.
    #
    # Raises ArgumentError when +name+ is not valid UTF-8.
    def self.search_wikipedia_titles(name)
      # Lazy requires, consistent with WebDocument.get_content.
      require 'rexml/document'
      require 'net/http'
      require 'kconv'

      # raise(Class, message, backtrace) expects a *backtrace* as third
      # argument; the offending string belongs in the message instead.
      raise ArgumentError, "Bad encoding: #{name}" unless name.isutf8

      # URI.escape was removed from Ruby; encode_www_form_component is
      # the supported way to escape a query-string parameter.
      res = REXML::Document.new(Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.encode_www_form_component(name)}&format=xml")).toutf8).elements['api/query/search']
      res.collect { |e| e.attributes['title'] } unless res.nil?
    end

    # Resolves the canonical Wikipedia URL for the page titled +name+,
    # or nil when the page does not exist ('missing' attribute present).
    #
    # Raises ArgumentError when +name+ is not valid UTF-8.
    def self.get_url(name)
      require 'rexml/document'
      require 'net/http'
      require 'kconv'

      raise ArgumentError, "Bad encoding: #{name}" unless name.isutf8

      atts = REXML::Document.new(Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.encode_www_form_component(name)}&inprop=url&prop=info&format=xml")).toutf8).elements['api/query/pages/page'].attributes
      atts['fullurl'] if atts['missing'].nil?
    end

    # Builds a WikipediaPage from the best-matching title for +name+,
    # or returns nil when the search yields nothing.
    def self.search_homepage(name)
      title = WikipediaPage.search_wikipedia_titles name
      WikipediaPage.new(WikipediaPage.get_url(title[0])) unless title.nil? || title.empty?
    end

    # def initialize(name)
    #   title = WikipediaPage.search_wikipedia_titles name
    #   raise ArgumentError, "No page found" if title.empty?
    #   super WikipediaPage.get_url title[0]
    # end
  end
end