Blame view
lib/rir/document.rb
3.33 KB
7043da90b first commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
#!/usr/bin/env ruby

# This file is a part of an Information Retrieval oriented Ruby library
#
# Copyright (C) 2010-2011  Romain Deveaud <romain.deveaud@gmail.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# General module for many purposes related to Information Retrieval.
module Rir
  # A Document is a bag of words and is constructed from a string.
  class Document
    attr_reader :words, :doc_content

    # Builds a Document from the raw string +content+. The string is kept
    # as-is in +doc_content+ and tokenized once into +words+.
    def initialize(content)
      @doc_content = content
      @words = format_words
    end

    # Splits the document content into words.
    #
    # Any non-word characters are removed from the words (see
    # http://perldoc.perl.org/perlre.html and the \\W special escape), and
    # only tokens containing at least one ASCII letter are kept, so purely
    # numeric tokens are dropped. The original case of each word is preserved.
    #
    # Protected function, only meant to be called at the initialization.
    def format_words
      # Whitespace is itself matched by \W, so a single split is enough.
      # Empty tokens (e.g. from leading punctuation) fail the letter test.
      @doc_content.split(/\W+/).select { |token| token =~ /[a-zA-Z]/ }
    end

    # Returns an Array containing the distinct +n+-grams (words) from the
    # current Document, in order of first appearance. Comparison is
    # case-sensitive.
    #
    # Returns an empty Array when +n+ is not a positive Integer or when the
    # Document holds fewer than +n+ words.
    #
    #   ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]
    def ngrams(n)
      # each_cons raises ArgumentError for sizes below 1; answer [] instead.
      return [] unless n.is_a?(Integer) && n >= 1

      @words.each_cons(n).map { |gram| gram.join(" ") }.uniq
    end

    # Returns a Hash containing the words and their associated counts in the
    # current Document. Words are downcased before being counted. Looking up
    # a word that is absent yields 0 and does not add it to the Hash.
    #
    #   count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }
    def count_words
      @words.each_with_object(Hash.new(0)) do |word, counts|
        counts[word.downcase] += 1
      end
    end

    # Computes the entropy of a given string +s+ inside the document.
    #
    # If the string parameter is composed of many words (i.e. tokens separated
    # by whitespace(s)), it is considered as an ngram and the per-word terms
    # are summed. Words are matched case-insensitively, like #count_words.
    # Words absent from the document contribute 0, so the result is never NaN
    # (including for an empty document).
    #
    #   entropy("guitar") #=> 0.00389919463243839
    def entropy(s)
      counts = count_words
      total  = @words.size.to_f

      s.split.reduce(0.0) do |sum, word|
        occurrences = counts[word.downcase]
        # p * log2(p) tends to 0 as p -> 0; skipping avoids 0 * -Infinity.
        next sum if occurrences.zero?

        probability = occurrences / total
        sum - probability * Math.log2(probability)
      end
    end

    protected :format_words
  end

  # A WebDocument is a Document with a +url+.
  class WebDocument < Document
    attr_reader :url

    # Returns the HTML text from the page of a given +url+.
    def self.get_content(url)
      # Loaded on first use so that plain Documents do not pull in net/http.
      require 'net/http'
      Net::HTTP.get(URI.parse(url))
    end

    # WebDocument constructor, the content of the Document is the HTML page
    # without the tags.
    #
    # NOTE(review): strip_javascripts, strip_stylesheets and strip_xml_tags
    # are not defined in this file; presumably String extensions provided
    # elsewhere in the library -- confirm they are loaded before use.
    def initialize(url)
      @url = url
      super(WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags)
    end
  end

  # A WikipediaPage is a WebDocument.
  class WikipediaPage < WebDocument
  end
end