Commit 3e81fa06a9b8fbedc6ca161cb26b8a1884c93d36

Authored by Romain Deveaud
1 parent 145387519e
Exists in master

Add an entropy computation example. Words in an RIR::Document are now lowercased.

Showing 3 changed files with 12 additions and 9 deletions Inline Diff

File was created 1 require 'rir'
2
# Concatenates all lines from one file, without \n.
# File.readlines opens and closes the file itself, so no handle is leaked.
readme = File.readlines('README.markdown').map { |l| l.chomp }.join(" ")

# Creates the document with a string
doc = RIR::Document.new readme

# Outputs all the unique words of the document with their entropy scores.
# Document#words keeps every occurrence, hence the explicit uniq.
p doc.words.uniq.map { |w| "#{w} => #{doc.entropy w}" }
11
1 #!/usr/bin/env ruby 1 #!/usr/bin/env ruby
2 2
3 # This file is a part of an Information Retrieval oriented Ruby library 3 # This file is a part of an Information Retrieval oriented Ruby library
4 # 4 #
5 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> 5 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
6 # 6 #
7 # This program is free software: you can redistribute it and/or modify 7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by 8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or 9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version. 10 # (at your option) any later version.
11 # 11 #
12 # This program is distributed in the hope that it will be useful, 12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of 13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details. 15 # GNU General Public License for more details.
16 # 16 #
17 # You should have received a copy of the GNU General Public License 17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>. 18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
19 19
20 # General module for many purposes related to Information Retrieval. 20 # General module for many purposes related to Information Retrieval.
21 module RIR 21 module RIR
22 22
# A Document is a bag of words and is constructed from a string.
class Document
  attr_reader :words, :doc_content

  # Builds a Document from the +content+ string. The raw string is kept in
  # +doc_content+ and its lowercased words in +words+.
  def initialize(content)
    @doc_content = content
    @words = format_words
  end

  # Returns an Array containing the unique +n+-grams (words) from the current
  # Document, in order of first appearance.
  #
  #   ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]
  #
  # An empty Array is returned when +n+ is lower than 1 or greater than the
  # number of words.
  def ngrams(n)
    # each_cons raises ArgumentError on a non-positive size.
    return [] if n < 1

    @words.each_cons(n).map { |gram| gram.join(" ") }.uniq
  end

  # Returns a Hash containing the words and their associated counts in the current Document.
  #
  #   count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }
  #
  # Looking up a word that is not in the Document returns 0 and does not add
  # the word to the Hash.
  def count_words
    counts = Hash.new(0)
    @words.each { |w| counts[w] += 1 }

    counts
  end

  # Computes the entropy of a given string +s+ inside the document.
  #
  # If the string parameter is composed of many words (i.e. tokens separated
  # by whitespace(s)), it is considered as an ngram and the contributions of
  # its words are summed.
  #
  #   entropy("guitar") #=> 0.00389919463243839
  #
  # The words of +s+ are lowercased before the lookup, since the words of the
  # Document are stored lowercased. A word that does not occur in the
  # Document contributes 0, and an empty Document always yields 0.0.
  def entropy(s)
    total = @words.size.to_f
    # Avoids 0/0 (NaN) on a Document without any word.
    return 0.0 if total.zero?

    counts = count_words
    sum = s.split.inject(0.0) do |acc, w|
      p_wi = counts[w.downcase] / total
      # 0 * log2(0) evaluates to NaN, so absent words are skipped.
      p_wi > 0 ? acc + p_wi * Math.log2(p_wi) : acc
    end

    # Returns a plain 0.0 rather than -0.0 when nothing contributed.
    sum.zero? ? 0.0 : -sum
  end

  protected

  # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html
  # and the \\W special escape). Tokens without any ASCII letter are dropped
  # and the remaining ones are lowercased.
  #
  # Protected function, only meant to be called at the initialization.
  def format_words
    # Whitespace is itself a non-word character, so a single split on \W
    # yields the same tokens as splitting on whitespace first.
    tokens = @doc_content.split(/\W/)
    tokens.select { |t| t =~ /[a-zA-Z]/ }.map { |t| t.downcase }
  end
end
99 99
# A WebDocument is a Document that also remembers the +url+ it was built from.
class WebDocument < Document
  attr_reader :url

  class << self
    # Fetches the page located at +url+ and returns its body (the HTML text).
    def get_content(url)
      require 'net/http'
      location = URI.parse(url)
      Net::HTTP.get(location)
    end
  end

  # Downloads the page at +url+ and uses its text as the Document content,
  # once the scripts, the stylesheets and the tags have been stripped.
  def initialize(url)
    @url = url
    html = WebDocument.get_content(url)
    text = html.strip_javascripts.strip_stylesheets.strip_xml_tags
    super(text)
  end
end
117 117
# A WikipediaPage is a WebDocument.
class WikipediaPage < WebDocument
  require 'rexml/document'
  require 'net/http'
  require 'kconv'

  # Endpoint of the English Wikipedia API used by the class methods below.
  API_ENDPOINT = "http://en.wikipedia.org/w/api.php".freeze

  # Sends a GET request to the Wikipedia API with the given +params+ (a Hash
  # of query parameters) and returns the response parsed as a
  # REXML::Document.
  #
  # The query string is built with URI.encode_www_form, which escapes every
  # value (URI.escape no longer exists since Ruby 3.0).
  def self.api_query(params)
    uri = URI.parse("#{API_ENDPOINT}?#{URI.encode_www_form(params)}")
    REXML::Document.new(Net::HTTP.get(uri).toutf8)
  end
  private_class_method :api_query

  # Searches Wikipedia for +name+ and returns an Array with the titles of the
  # results, or nil when the response holds no search element.
  def self.search_wikipedia_titles(name)
    res = api_query(:action => 'query', :list => 'search',
                    :srsearch => name, :format => 'xml').elements['api/query/search']

    res.map { |e| e.attributes['title'] } unless res.nil?
  end

  # Returns the full URL of the page titled +name+, or nil when the page is
  # flagged as missing or when the response holds no page element.
  def self.get_url(name)
    page = api_query(:action => 'query', :titles => name, :inprop => 'url',
                     :prop => 'info', :format => 'xml').elements['api/query/pages/page']
    return nil if page.nil?

    atts = page.attributes
    atts['fullurl'] if atts['missing'].nil?
  end

  # Builds the WikipediaPage of the first search result for +name+.
  #
  # Returns nil when the search yields no title. This is a best-effort
  # lookup: if the page cannot be fetched, the title is printed and nil is
  # returned instead of raising.
  def self.search_homepage(name)
    title = WikipediaPage.search_wikipedia_titles name
    return nil if title.nil? || title.empty?

    begin
      WikipediaPage.new(WikipediaPage.get_url(title[0]))
    rescue StandardError
      puts title[0]
    end
  end

  # def initialize(name)
  #   title = WikipediaPage.search_wikipedia_titles name
  #   raise ArgumentError, "No page found" if title.empty?
  #   super WikipediaPage.get_url title[0]
  # end
end
153 end 153 end
154 154
# Puts the local "lib" directory at the front of the load path before requiring the library.
1 $LOAD_PATH.unshift File.expand_path(File.join(File.dirname(__FILE__), "lib")) 1 $LOAD_PATH.unshift File.expand_path(File.join(File.dirname(__FILE__), "lib"))
2 2
3 require 'rir' 3 require 'rir'
4 4
# Builds a WikipediaPage from the article URL and prints the entropy of "guitar" in it.
5 w = RIR::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan") 5 w = RIR::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan")
6 p w.entropy("guitar") 6 p w.entropy("guitar")
7
# NOTE(review): the statements below carry a single line number (one diff column only) - presumably the lines removed by this commit; confirm against the repository.
8 params = RIR::Indri::Parameters.new("path_vers_mon_index")
9 q = RIR::Indri::IndriQuery.new("pouet", "bla", params)
10 puts q
11
12 c = RIR::Corpus.new "/home/romain/INEX/BookTrack/corpus/"
13 puts c.files.size
14 7