# Patch overview (three files touched):
#   * examples/entropy.rb (new file): requires 'rir', chomps every line of
#     README.markdown and joins them with single spaces, builds an
#     RIR::Document from that string, then prints each entry of doc.words
#     together with doc.entropy for that word.
#   * lib/rir/document.rb: the word-collection loop now pushes sw.downcase
#     instead of sw, and count_words indexes its counter hash by w instead of
#     w.downcase, so case folding moves from counting time to collection time.
#   * main.rb: removes the RIR::Indri::Parameters / IndriQuery demo and the
#     RIR::Corpus file-count demo; the WikipediaPage entropy lines remain as
#     unchanged context.
# NOTE(review): the patch's line structure appears to have been collapsed onto
#   a single line; hunk boundaries, blank context lines and indentation cannot
#   be recovered from here -- confirm against the original commit before applying.
# NOTE(review): the new example calls File.open('README.markdown').readlines
#   without a block or an explicit close; File.readlines would avoid leaving
#   the handle open if the example is ever revised.
diff --git a/examples/entropy.rb b/examples/entropy.rb new file mode 100644 index 0000000..0dc36dd --- /dev/null +++ b/examples/entropy.rb @@ -0,0 +1,10 @@ +require 'rir' + +# Concatenates all lines from one file, without \n +readme = File.open('README.markdown').readlines.collect { |l| l.chomp }.join(" ") + +# Creates the document with a string +doc = RIR::Document.new readme + +# Outputs all the unique words of the document with their entropy scores +p doc.words.collect { |w| "#{w} => #{doc.entropy w}" } diff --git a/lib/rir/document.rb b/lib/rir/document.rb index 9bd05ae..5bda4e1 100644 --- a/lib/rir/document.rb +++ b/lib/rir/document.rb @@ -33,7 +33,7 @@ module RIR @doc_content.split.each do |w| w.split(/\W/).each do |sw| - wo.push(sw) if sw =~ /[a-zA-Z]/ + wo.push(sw.downcase) if sw =~ /[a-zA-Z]/ end end @@ -63,7 +63,7 @@ module RIR # count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... } def count_words counts = Hash.new { |h,k| h[k] = 0 } - @words.each { |w| counts[w.downcase] += 1 } + @words.each { |w| counts[w] += 1 } counts end diff --git a/main.rb b/main.rb index f79f17c..87408f7 100644 --- a/main.rb +++ b/main.rb @@ -4,10 +4,3 @@ require 'rir' w = RIR::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan") p w.entropy("guitar") - -params = RIR::Indri::Parameters.new("path_vers_mon_index") -q = RIR::Indri::IndriQuery.new("pouet", "bla", params) -puts q - -c = RIR::Corpus.new "/home/romain/INEX/BookTrack/corpus/" -puts c.files.size