Blame view

lib/rir/document.rb 3.33 KB
7043da90b   Romain Deveaud   first commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
  #!/usr/bin/env ruby
  
  # This file is a part of an Information Retrieval oriented Ruby library
  #
  # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
  #
  # This program is free software: you can redistribute it and/or modify
  # it under the terms of the GNU General Public License as published by
  # the Free Software Foundation, either version 3 of the License, or
  # (at your option) any later version.
  #
  # This program is distributed in the hope that it will be useful,
  # but WITHOUT ANY WARRANTY; without even the implied warranty of
  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  # GNU General Public License for more details.
  #
  # You should have received a copy of the GNU General Public License
  # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  
  # General module for many purposes related to Information Retrieval.
  module Rir
  
    # A Document is a bag of words and is constructed from a string.
    #
    # +doc_content+ holds the string given to the constructor, +words+ holds
    # the tokens extracted from it, in their order of appearance.
    class Document
      attr_reader :words, :doc_content

      # Builds a Document from +content+ (a String) and tokenizes it
      # immediately.
      def initialize(content)
        @doc_content = content
        @words = format_words
      end

      # Returns an Array containing the distinct +n+-grams (words) from the
      # current Document, in order of first appearance.
      #
      #   ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]
      #
      # An empty Array is returned when +n+ is lower than 1 or greater than
      # the number of words.
      def ngrams(n)
        # each_cons rejects sizes below 1, whereas this method has always
        # answered with an empty result for them.
        return [] if n < 1

        @words.each_cons(n).map { |gram| gram.join(" ") }.uniq
      end

      # Returns a Hash containing the words and their associated counts in the
      # current Document. Words are downcased before being counted.
      #
      #   count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }
      def count_words
        # A plain default of 0 (rather than a default block) means that
        # reading an unknown word from the result does not insert it.
        @words.each_with_object(Hash.new(0)) do |word, counts|
          counts[word.downcase] += 1
        end
      end

      # Computes the entropy of a given string +s+ inside the document.
      #
      # If the string parameter is composed of many words (i.e. tokens separated
      # by whitespace(s)), it is considered as an ngram: the contributions of
      # its words are summed.
      #
      # The lookup is case-insensitive, consistently with count_words. Words
      # that do not occur in the document contribute 0, and the entropy of
      # anything inside an empty document is 0.0.
      #
      #   entropy("guitar") #=> 0.00389919463243839
      def entropy(s)
        total = @words.size.to_f
        return 0.0 if total.zero? # avoids 0/0 (NaN) on an empty document

        counts = count_words
        s.split.inject(0.0) do |sum, token|
          p_wi = counts[token.downcase] / total
          # 0 * log2(0) evaluates to NaN; by convention that term is 0.
          p_wi.zero? ? sum : sum - p_wi * Math.log2(p_wi)
        end
      end

      # Any non-word characters are removed from the words (see
      # http://perldoc.perl.org/perlre.html and the \\W special escape), and
      # only the tokens containing at least one ASCII letter are kept.
      #
      # Protected function, only meant to be called at the initialization.
      def format_words
        @doc_content.scan(/\w+/).select { |token| token =~ /[a-zA-Z]/ }
      end

      protected :format_words
    end
  
    # A WebDocument is a Document that also remembers the +url+ it was
    # built from.
    class WebDocument < Document
      attr_reader :url

      # Downloads the page found at +url+ and returns what Net::HTTP.get
      # gives back for it (the HTML text of the page).
      def self.get_content(url)
        require 'net/http' # loaded lazily, only when a page is actually fetched
        target = URI.parse(url)
        Net::HTTP.get(target)
      end

      # Builds a WebDocument from +url+. The page is fetched, then stripped of
      # its javascripts, stylesheets and tags, and the remaining text becomes
      # the content of the Document.
      def initialize(url)
        @url = url
        raw_page = WebDocument.get_content(url)
        # NOTE(review): the strip_* helpers are not defined in this file;
        # presumably String extensions provided elsewhere in the library.
        text = raw_page.strip_javascripts.strip_stylesheets.strip_xml_tags
        super(text)
      end
    end
  
    # A WikipediaPage is a WebDocument; it defines nothing beyond what it
    # inherits.
    class WikipediaPage < WebDocument; end
  end