Commit 3e81fa06a9b8fbedc6ca161cb26b8a1884c93d36
1 parent
145387519e
Exists in
master
Added an entropy computation example. Words in an RIR::Document are now lowercased.
Showing 3 changed files with 12 additions and 9 deletions Inline Diff
examples/entropy.rb
| File was created | 1 | require 'rir' | |
| 2 | |||
| 3 | # Concatenates all lines from one file, without \n | ||
| 4 | readme = File.open('README.markdown').readlines.collect { |l| l.chomp }.join(" ") | ||
| 5 | |||
| 6 | # Creates the document with a string | ||
| 7 | doc = RIR::Document.new readme | ||
| 8 | |||
| 9 | # Outputs all the unique words of the document with their entropy scores | ||
| 10 | p doc.words.collect { |w| "#{w} => #{doc.entropy w}" } | ||
| 11 |
lib/rir/document.rb
| 1 | #!/usr/bin/env ruby | 1 | #!/usr/bin/env ruby |
| 2 | 2 | ||
| 3 | # This file is a part of an Information Retrieval oriented Ruby library | 3 | # This file is a part of an Information Retrieval oriented Ruby library |
| 4 | # | 4 | # |
| 5 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | 5 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> |
| 6 | # | 6 | # |
| 7 | # This program is free software: you can redistribute it and/or modify | 7 | # This program is free software: you can redistribute it and/or modify |
| 8 | # it under the terms of the GNU General Public License as published by | 8 | # it under the terms of the GNU General Public License as published by |
| 9 | # the Free Software Foundation, either version 3 of the License, or | 9 | # the Free Software Foundation, either version 3 of the License, or |
| 10 | # (at your option) any later version. | 10 | # (at your option) any later version. |
| 11 | # | 11 | # |
| 12 | # This program is distributed in the hope that it will be useful, | 12 | # This program is distributed in the hope that it will be useful, |
| 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | 13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 15 | # GNU General Public License for more details. | 15 | # GNU General Public License for more details. |
| 16 | # | 16 | # |
| 17 | # You should have received a copy of the GNU General Public License | 17 | # You should have received a copy of the GNU General Public License |
| 18 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | 18 | # along with this program. If not, see <http://www.gnu.org/licenses/>. |
| 19 | 19 | ||
| 20 | # General module for many purposes related to Information Retrieval. | 20 | # General module for many purposes related to Information Retrieval. |
| 21 | module RIR | 21 | module RIR |
| 22 | 22 | ||
| 23 | # A Document is a bag of words and is constructed from a string. | 23 | # A Document is a bag of words and is constructed from a string. |
| 24 | class Document | 24 | class Document |
| 25 | attr_reader :words, :doc_content | 25 | attr_reader :words, :doc_content |
| 26 | 26 | ||
| 27 | # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html | 27 | # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html |
| 28 | # and the \\W special escape). | 28 | # and the \\W special escape). |
| 29 | # | 29 | # |
| 30 | # Protected function, only meant to be called at the initialization. | 30 | # Protected function, only meant to be called at the initialization. |
| 31 | def format_words | 31 | def format_words |
| 32 | wo = [] | 32 | wo = [] |
| 33 | 33 | ||
| 34 | @doc_content.split.each do |w| | 34 | @doc_content.split.each do |w| |
| 35 | w.split(/\W/).each do |sw| | 35 | w.split(/\W/).each do |sw| |
| 36 | wo.push(sw) if sw =~ /[a-zA-Z]/ | 36 | wo.push(sw.downcase) if sw =~ /[a-zA-Z]/ |
| 37 | end | 37 | end |
| 38 | end | 38 | end |
| 39 | 39 | ||
| 40 | wo | 40 | wo |
| 41 | end | 41 | end |
| 42 | 42 | ||
| 43 | # Returns an Array containing the +n+-grams (words) from the current Document. | 43 | # Returns an Array containing the +n+-grams (words) from the current Document. |
| 44 | # | 44 | # |
| 45 | # ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...] | 45 | # ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...] |
| 46 | def ngrams(n) | 46 | def ngrams(n) |
| 47 | window = [] | 47 | window = [] |
| 48 | ngrams_array = [] | 48 | ngrams_array = [] |
| 49 | 49 | ||
| 50 | @words.each do |w| | 50 | @words.each do |w| |
| 51 | window.push(w) | 51 | window.push(w) |
| 52 | if window.size == n | 52 | if window.size == n |
| 53 | ngrams_array.push window.join(" ") | 53 | ngrams_array.push window.join(" ") |
| 54 | window.delete_at(0) | 54 | window.delete_at(0) |
| 55 | end | 55 | end |
| 56 | end | 56 | end |
| 57 | 57 | ||
| 58 | ngrams_array.uniq | 58 | ngrams_array.uniq |
| 59 | end | 59 | end |
| 60 | 60 | ||
| 61 | # Returns a Hash containing the words and their associated counts in the current Document. | 61 | # Returns a Hash containing the words and their associated counts in the current Document. |
| 62 | # | 62 | # |
| 63 | # count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... } | 63 | # count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... } |
| 64 | def count_words | 64 | def count_words |
| 65 | counts = Hash.new { |h,k| h[k] = 0 } | 65 | counts = Hash.new { |h,k| h[k] = 0 } |
| 66 | @words.each { |w| counts[w.downcase] += 1 } | 66 | @words.each { |w| counts[w] += 1 } |
| 67 | 67 | ||
| 68 | counts | 68 | counts |
| 69 | end | 69 | end |
| 70 | 70 | ||
| 71 | # Computes the entropy of a given string +s+ inside the document. | 71 | # Computes the entropy of a given string +s+ inside the document. |
| 72 | # | 72 | # |
| 73 | # If the string parameter is composed of many words (i.e. tokens separated | 73 | # If the string parameter is composed of many words (i.e. tokens separated |
| 74 | # by whitespace(s)), it is considered as an ngram. | 74 | # by whitespace(s)), it is considered as an ngram. |
| 75 | # | 75 | # |
| 76 | # entropy("guitar") #=> 0.00389919463243839 | 76 | # entropy("guitar") #=> 0.00389919463243839 |
| 77 | def entropy(s) | 77 | def entropy(s) |
| 78 | en = 0.0 | 78 | en = 0.0 |
| 79 | counts = self.count_words | 79 | counts = self.count_words |
| 80 | 80 | ||
| 81 | s.split.each do |w| | 81 | s.split.each do |w| |
| 82 | p_wi = counts[w].to_f/@words.count.to_f | 82 | p_wi = counts[w].to_f/@words.count.to_f |
| 83 | en += p_wi*Math.log2(p_wi) | 83 | en += p_wi*Math.log2(p_wi) |
| 84 | end | 84 | end |
| 85 | 85 | ||
| 86 | en *= -1 | 86 | en *= -1 |
| 87 | en | 87 | en |
| 88 | end | 88 | end |
| 89 | 89 | ||
| 90 | 90 | ||
| 91 | 91 | ||
| 92 | def initialize(content) | 92 | def initialize(content) |
| 93 | @doc_content = content | 93 | @doc_content = content |
| 94 | @words = format_words | 94 | @words = format_words |
| 95 | end | 95 | end |
| 96 | 96 | ||
| 97 | protected :format_words | 97 | protected :format_words |
| 98 | end | 98 | end |
| 99 | 99 | ||
| 100 | # A WebDocument is a Document with a +url+. | 100 | # A WebDocument is a Document with a +url+. |
| 101 | class WebDocument < Document | 101 | class WebDocument < Document |
| 102 | attr_reader :url | 102 | attr_reader :url |
| 103 | 103 | ||
| 104 | # Returns the HTML text from the page of a given +url+. | 104 | # Returns the HTML text from the page of a given +url+. |
| 105 | def self.get_content(url) | 105 | def self.get_content(url) |
| 106 | require 'net/http' | 106 | require 'net/http' |
| 107 | Net::HTTP.get(URI.parse(url)) | 107 | Net::HTTP.get(URI.parse(url)) |
| 108 | end | 108 | end |
| 109 | 109 | ||
| 110 | # WebDocument constructor, the content of the Document is the HTML page | 110 | # WebDocument constructor, the content of the Document is the HTML page |
| 111 | # without the tags. | 111 | # without the tags. |
| 112 | def initialize(url) | 112 | def initialize(url) |
| 113 | @url = url | 113 | @url = url |
| 114 | super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags | 114 | super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags |
| 115 | end | 115 | end |
| 116 | end | 116 | end |
| 117 | 117 | ||
| 118 | # A WikipediaPage is a WebDocument. | 118 | # A WikipediaPage is a WebDocument. |
| 119 | class WikipediaPage < WebDocument | 119 | class WikipediaPage < WebDocument |
| 120 | require 'rexml/document' | 120 | require 'rexml/document' |
| 121 | require 'net/http' | 121 | require 'net/http' |
| 122 | require 'kconv' | 122 | require 'kconv' |
| 123 | 123 | ||
| 124 | 124 | ||
| 125 | def self.search_wikipedia_titles(name) | 125 | def self.search_wikipedia_titles(name) |
| 126 | res = REXML::Document.new(Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml")).toutf8).elements['api/query/search'] | 126 | res = REXML::Document.new(Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml")).toutf8).elements['api/query/search'] |
| 127 | 127 | ||
| 128 | res.collect { |e| e.attributes['title'] } unless res.nil? | 128 | res.collect { |e| e.attributes['title'] } unless res.nil? |
| 129 | end | 129 | end |
| 130 | 130 | ||
| 131 | def self.get_url(name) | 131 | def self.get_url(name) |
| 132 | atts = REXML::Document.new(Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml")).toutf8).elements['api/query/pages/page'].attributes | 132 | atts = REXML::Document.new(Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml")).toutf8).elements['api/query/pages/page'].attributes |
| 133 | 133 | ||
| 134 | atts['fullurl'] if atts['missing'].nil? | 134 | atts['fullurl'] if atts['missing'].nil? |
| 135 | end | 135 | end |
| 136 | 136 | ||
| 137 | def self.search_homepage(name) | 137 | def self.search_homepage(name) |
| 138 | title = WikipediaPage.search_wikipedia_titles name | 138 | title = WikipediaPage.search_wikipedia_titles name |
| 139 | 139 | ||
| 140 | begin | 140 | begin |
| 141 | WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty? | 141 | WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty? |
| 142 | rescue | 142 | rescue |
| 143 | puts title[0] | 143 | puts title[0] |
| 144 | end | 144 | end |
| 145 | end | 145 | end |
| 146 | 146 | ||
| 147 | # def initialize(name) | 147 | # def initialize(name) |
| 148 | # title = WikipediaPage.search_wikipedia_titles name | 148 | # title = WikipediaPage.search_wikipedia_titles name |
| 149 | # raise ArgumentError, "No page found" if title.empty? | 149 | # raise ArgumentError, "No page found" if title.empty? |
| 150 | # super WikipediaPage.get_url title[0] | 150 | # super WikipediaPage.get_url title[0] |
| 151 | # end | 151 | # end |
| 152 | end | 152 | end |
| 153 | end | 153 | end |
| 154 | 154 |
main.rb
| 1 | $LOAD_PATH.unshift File.expand_path(File.join(File.dirname(__FILE__), "lib")) | 1 | $LOAD_PATH.unshift File.expand_path(File.join(File.dirname(__FILE__), "lib")) |
| 2 | 2 | ||
| 3 | require 'rir' | 3 | require 'rir' |
| 4 | 4 | ||
| 5 | w = RIR::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan") | 5 | w = RIR::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan") |
| 6 | p w.entropy("guitar") | 6 | p w.entropy("guitar") |
| 7 | |||
| 8 | params = RIR::Indri::Parameters.new("path_vers_mon_index") | ||
| 9 | q = RIR::Indri::IndriQuery.new("pouet", "bla", params) | ||
| 10 | puts q | ||
| 11 | |||
| 12 | c = RIR::Corpus.new "/home/romain/INEX/BookTrack/corpus/" | ||
| 13 | puts c.files.size | ||
| 14 | 7 |