Deveaud Romain / mirimiri

Browse Code »

Commit 3e81fa06a9b8fbedc6ca161cb26b8a1884c93d36

Authored by Romain Deveaud 2010-11-23 18:17:43 +0100

1 parent 145387519e

Exists in master

an entropy computation example. words in an RIR::Document are now lowercased.

Showing 3 changed files with 12 additions and 9 deletions Inline Diff

examples/entropy.rb
lib/rir/document.rb
main.rb

examples/entropy.rb

Diff comments View file @ 3e81fa0

File was created	1	require 'rir'
	2
	3	# Concatenates all lines from one file, without \n
	4	readme = File.open('README.markdown').readlines.collect { \|l\| l.chomp }.join(" ")
	5
	6	# Creates the document with a string
	7	doc = RIR::Document.new readme
	8
	9	# Outputs all the unique words of the document with their entropy scores
	10	p doc.words.collect { \|w\| "#{w} => #{doc.entropy w}" }
	11

lib/rir/document.rb

Diff comments View file @ 3e81fa0

1	#!/usr/bin/env ruby	1	#!/usr/bin/env ruby
2		2
3	# This file is a part of an Information Retrieval oriented Ruby library	3	# This file is a part of an Information Retrieval oriented Ruby library
4	#	4	#
5	# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>	5	# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
6	#	6	#
7	# This program is free software: you can redistribute it and/or modify	7	# This program is free software: you can redistribute it and/or modify
8	# it under the terms of the GNU General Public License as published by	8	# it under the terms of the GNU General Public License as published by
9	# the Free Software Foundation, either version 3 of the License, or	9	# the Free Software Foundation, either version 3 of the License, or
10	# (at your option) any later version.	10	# (at your option) any later version.
11	#	11	#
12	# This program is distributed in the hope that it will be useful,	12	# This program is distributed in the hope that it will be useful,
13	# but WITHOUT ANY WARRANTY; without even the implied warranty of	13	# but WITHOUT ANY WARRANTY; without even the implied warranty of
14	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	14	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	# GNU General Public License for more details.	15	# GNU General Public License for more details.
16	#	16	#
17	# You should have received a copy of the GNU General Public License	17	# You should have received a copy of the GNU General Public License
18	# along with this program. If not, see <http://www.gnu.org/licenses/>.	18	# along with this program. If not, see <http://www.gnu.org/licenses/>.
19		19
20	# General module for many purposes related to Information Retrieval.	20	# General module for many purposes related to Information Retrieval.
21	module RIR	21	module RIR
22		22
23	# A Document is a bag of words and is constructed from a string.	23	# A Document is a bag of words and is constructed from a string.
24	class Document	24	class Document
25	attr_reader :words, :doc_content	25	attr_reader :words, :doc_content
26		26
27	# Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html	27	# Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html
28	# and the \\W special escape).	28	# and the \\W special escape).
29	#	29	#
30	# Protected function, only meant to be called at initialization.	30	# Protected function, only meant to be called at initialization.
31	def format_words	31	def format_words
32	wo = []	32	wo = []
33		33
34	@doc_content.split.each do \|w\|	34	@doc_content.split.each do \|w\|
35	w.split(/\W/).each do \|sw\|	35	w.split(/\W/).each do \|sw\|
36	wo.push(sw) if sw =~ /[a-zA-Z]/	36	wo.push(sw.downcase) if sw =~ /[a-zA-Z]/
37	end	37	end
38	end	38	end
39		39
40	wo	40	wo
41	end	41	end
42		42
43	# Returns an Array containing the +n+-grams (words) from the current Document.	43	# Returns an Array containing the +n+-grams (words) from the current Document.
44	#	44	#
45	# ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]	45	# ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]
46	def ngrams(n)	46	def ngrams(n)
47	window = []	47	window = []
48	ngrams_array = []	48	ngrams_array = []
49		49
50	@words.each do \|w\|	50	@words.each do \|w\|
51	window.push(w)	51	window.push(w)
52	if window.size == n	52	if window.size == n
53	ngrams_array.push window.join(" ")	53	ngrams_array.push window.join(" ")
54	window.delete_at(0)	54	window.delete_at(0)
55	end	55	end
56	end	56	end
57		57
58	ngrams_array.uniq	58	ngrams_array.uniq
59	end	59	end
60		60
61	# Returns a Hash containing the words and their associated counts in the current Document.	61	# Returns a Hash containing the words and their associated counts in the current Document.
62	#	62	#
63	# count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }	63	# count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }
64	def count_words	64	def count_words
65	counts = Hash.new { \|h,k\| h[k] = 0 }	65	counts = Hash.new { \|h,k\| h[k] = 0 }
66	@words.each { \|w\| counts[w.downcase] += 1 }	66	@words.each { \|w\| counts[w] += 1 }
67		67
68	counts	68	counts
69	end	69	end
70		70
71	# Computes the entropy of a given string +s+ inside the document.	71	# Computes the entropy of a given string +s+ inside the document.
72	#	72	#
73	# If the string parameter is composed of many words (i.e. tokens separated	73	# If the string parameter is composed of many words (i.e. tokens separated
74	# by whitespace(s)), it is considered as an ngram.	74	# by whitespace(s)), it is considered as an ngram.
75	#	75	#
76	# entropy("guitar") #=> 0.00389919463243839	76	# entropy("guitar") #=> 0.00389919463243839
77	def entropy(s)	77	def entropy(s)
78	en = 0.0	78	en = 0.0
79	counts = self.count_words	79	counts = self.count_words
80		80
81	s.split.each do \|w\|	81	s.split.each do \|w\|
82	p_wi = counts[w].to_f/@words.count.to_f	82	p_wi = counts[w].to_f/@words.count.to_f
83	en += p_wi*Math.log2(p_wi)	83	en += p_wi*Math.log2(p_wi)
84	end	84	end
85		85
86	en *= -1	86	en *= -1
87	en	87	en
88	end	88	end
89		89
90		90
91		91
92	def initialize(content)	92	def initialize(content)
93	@doc_content = content	93	@doc_content = content
94	@words = format_words	94	@words = format_words
95	end	95	end
96		96
97	protected :format_words	97	protected :format_words
98	end	98	end
99		99
100	# A WebDocument is a Document with a +url+.	100	# A WebDocument is a Document with a +url+.
101	class WebDocument < Document	101	class WebDocument < Document
102	attr_reader :url	102	attr_reader :url
103		103
104	# Returns the HTML text from the page of a given +url+.	104	# Returns the HTML text from the page of a given +url+.
105	def self.get_content(url)	105	def self.get_content(url)
106	require 'net/http'	106	require 'net/http'
107	Net::HTTP.get(URI.parse(url))	107	Net::HTTP.get(URI.parse(url))
108	end	108	end
109		109
110	# WebDocument constructor, the content of the Document is the HTML page	110	# WebDocument constructor, the content of the Document is the HTML page
111	# without the tags.	111	# without the tags.
112	def initialize(url)	112	def initialize(url)
113	@url = url	113	@url = url
114	super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags	114	super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags
115	end	115	end
116	end	116	end
117		117
118	# A WikipediaPage is a WebDocument.	118	# A WikipediaPage is a WebDocument.
119	class WikipediaPage < WebDocument	119	class WikipediaPage < WebDocument
120	require 'rexml/document'	120	require 'rexml/document'
121	require 'net/http'	121	require 'net/http'
122	require 'kconv'	122	require 'kconv'
123		123
124		124
125	def self.search_wikipedia_titles(name)	125	def self.search_wikipedia_titles(name)
126	res = REXML::Document.new(Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml")).toutf8).elements['api/query/search']	126	res = REXML::Document.new(Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml")).toutf8).elements['api/query/search']
127		127
128	res.collect { \|e\| e.attributes['title'] } unless res.nil?	128	res.collect { \|e\| e.attributes['title'] } unless res.nil?
129	end	129	end
130		130
131	def self.get_url(name)	131	def self.get_url(name)
132	atts = REXML::Document.new(Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml")).toutf8).elements['api/query/pages/page'].attributes	132	atts = REXML::Document.new(Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml")).toutf8).elements['api/query/pages/page'].attributes
133		133
134	atts['fullurl'] if atts['missing'].nil?	134	atts['fullurl'] if atts['missing'].nil?
135	end	135	end
136		136
137	def self.search_homepage(name)	137	def self.search_homepage(name)
138	title = WikipediaPage.search_wikipedia_titles name	138	title = WikipediaPage.search_wikipedia_titles name
139		139
140	begin	140	begin
141	WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? \|\| title.empty?	141	WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? \|\| title.empty?
142	rescue	142	rescue
143	puts title[0]	143	puts title[0]
144	end	144	end
145	end	145	end
146		146
147	# def initialize(name)	147	# def initialize(name)
148	# title = WikipediaPage.search_wikipedia_titles name	148	# title = WikipediaPage.search_wikipedia_titles name
149	# raise ArgumentError, "No page found" if title.empty?	149	# raise ArgumentError, "No page found" if title.empty?
150	# super WikipediaPage.get_url title[0]	150	# super WikipediaPage.get_url title[0]
151	# end	151	# end
152	end	152	end
153	end	153	end
154		154

main.rb

Diff comments View file @ 3e81fa0

 $LOAD_PATH.unshift File.expand_path(File.join(File.dirname(__FILE__), "lib"))
 require 'rir'
 w = RIR::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan")
 p w.entropy("guitar")
-params = RIR::Indri::Parameters.new("path_vers_mon_index")
-q = RIR::Indri::IndriQuery.new("pouet", "bla", params)
-puts q
-c = RIR::Corpus.new "/home/romain/INEX/BookTrack/corpus/"
-puts c.files.size