diff --git a/examples/entropy.rb b/examples/entropy.rb index 0dc36dd..2eb8071 100644 --- a/examples/entropy.rb +++ b/examples/entropy.rb @@ -1,10 +1,10 @@ -require 'rir' +require 'mirimiri' # Concatenates all lines from one file, without \n readme = File.open('README.markdown').readlines.collect { |l| l.chomp }.join(" ") # Creates the document with a string -doc = RIR::Document.new readme +doc = Mirimiri::Document.new readme # Outputs all the unique words of the document with their entropy scores p doc.words.collect { |w| "#{w} => #{doc.entropy w}" } diff --git a/lib/mirimiri/document.rb b/lib/mirimiri/document.rb index 20752c3..88fc60d 100644 --- a/lib/mirimiri/document.rb +++ b/lib/mirimiri/document.rb @@ -99,7 +99,7 @@ module Mirimiri end - def initialize(content) + def initialize(content="") @doc_content = content @words = format_words end diff --git a/lib/mirimiri/query.rb b/lib/mirimiri/query.rb index 28d6653..511cf43 100644 --- a/lib/mirimiri/query.rb +++ b/lib/mirimiri/query.rb @@ -27,7 +27,7 @@ module Indri class Parameters attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline - def initialize(corpus,mem="1g",count="1000",offset="1",run_id="default",print_query=false,print_docs=false) + def initialize(corpus,count="1000",mem="1g",offset="1",run_id="default",print_query=false,print_docs=false) @index_path = corpus @memory = mem @count = count @@ -38,8 +38,7 @@ module Indri end def to_s - h = "\n" - h += "#{@memory}\n" + h = "#{@memory}\n" h += "#{@index_path}\n" h += "#{@count}\n" unless @baseline.nil? @@ -47,6 +46,7 @@ module Indri else h += "#{@rule}\n" end + h += "true\n" h += "#{@offset}\n" h += "#{@run_id}\n" h += "#{@print_query}\n" @@ -55,27 +55,42 @@ module Indri h end end - - class IndriQuery < Query - attr_accessor :id, :query, :params, :rule - def initialize(id,query,params) - @params = params - # Here we set the default retrieval model as Language Modeling - # with a Dirichlet smoothing at 2500. - # TODO: maybe a Rule class... - @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil? + class IndriQuery < Query + attr_accessor :id, :query, :rule + def initialize(id,query) @id = id @query = query end def to_s - h = @params.to_s - h += "\n" + h = "\n" h += "#{@id}\n" h += "#{@query}\n" h += "\n" + + h + end + end + + class IndriQueries + attr_accessor :params, :queries + + def initialize(params,*queries) + @queries = queries + + @params = params + # Here we set the default retrieval model as Language Modeling + # with a Dirichlet smoothing at 2500. + # TODO: maybe a Rule class... + @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil? + end + + def to_s + h = "\n" + h += @params.to_s + h += @queries.collect { |q| q.to_s }.join "" h += "" h diff --git a/lib/mirimiri/string.rb b/lib/mirimiri/string.rb index 11d73d9..13eee6c 100644 --- a/lib/mirimiri/string.rb +++ b/lib/mirimiri/string.rb @@ -67,6 +67,85 @@ module Mirimiri "yours", "yourself", "yourselves" ] + Transmap = { + "\xC3\x80" => "A", "\xC3\x81" => "A", "\xC3\x82" => "A", "\xC3\x83" => "A", + "\xC3\x84" => "A", "\xC3\x85" => "A", "\xC3\x86" => "AE","\xC3\x87" => "C", + "\xC3\x88" => "E", "\xC3\x89" => "E", "\xC3\x8A" => "E", "\xC3\x8B" => "E", + "\xC3\x8C" => "I", "\xC3\x8D" => "I", "\xC3\x8E" => "I", "\xC3\x8F" => "I", + "\xC3\x90" => "D", "\xC3\x91" => "N", "\xC3\x92" => "O", "\xC3\x93" => "O", + "\xC3\x94" => "O", "\xC3\x95" => "O", "\xC3\x96" => "O", "\xC3\x98" => "O", + "\xC3\x99" => "U", "\xC3\x9A" => "U", "\xC3\x9B" => "U", "\xC3\x9C" => "U", + "\xC3\x9D" => "Y", "\xC3\x9E" => "P", "\xC3\x9F" => "ss", + "\xC3\xA0" => "a", "\xC3\xA1" => "a", "\xC3\xA2" => "a", "\xC3\xA3" => "a", + "\xC3\xA4" => "a", "\xC3\xA5" => "a", "\xC3\xA6" => "ae","\xC3\xA7" => "c", + "\xC3\xA8" => "e", "\xC3\xA9" => "e", "\xC3\xAA" => "e", "\xC3\xAB" => "e", + "\xC3\xAC" => "i", "\xC3\xAD" => "i", "\xC3\xAE" => "i", "\xC3\xAF" => "i", + "\xC3\xB0" => "o", "\xC3\xB1" => "n", "\xC3\xB2" => "o", "\xC3\xB3" => "o", + "\xC3\xB4" => "o", "\xC3\xB5" => "o", "\xC3\xB6" => "o", "\xC3\xB8" => "o", + "\xC3\xB9" => "u", "\xC3\xBA" => "u", "\xC3\xBB" => "u", "\xC3\xBC" => "u", + "\xC3\xBD" => "y", "\xC3\xBE" => "p", "\xC3\xBF" => "y", + "\xC4\x80" => "A", "\xC4\x81" => "a", "\xC4\x82" => "A", "\xC4\x83" => "a", + "\xC4\x84" => "A", "\xC4\x85" => "a", "\xC4\x86" => "C", "\xC4\x87" => "c", + "\xC4\x88" => "C", "\xC4\x89" => "c", "\xC4\x8A" => "C", "\xC4\x8B" => "c", + "\xC4\x8C" => "C", "\xC4\x8D" => "c", "\xC4\x8E" => "D", "\xC4\x8F" => "d", + "\xC4\x90" => "D", "\xC4\x91" => "d", "\xC4\x92" => "E", "\xC4\x93" => "e", + "\xC4\x94" => "E", "\xC4\x95" => "e", "\xC4\x96" => "E", "\xC4\x97" => "e", + "\xC4\x98" => "E", "\xC4\x99" => "e", "\xC4\x9A" => "E", "\xC4\x9B" => "e", + "\xC4\x9C" => "G", "\xC4\x9D" => "g", "\xC4\x9E" => "G", "\xC4\x9F" => "g", + "\xC4\xA0" => "G", "\xC4\xA1" => "g", "\xC4\xA2" => "G", "\xC4\xA3" => "g", + "\xC4\xA4" => "H", "\xC4\xA5" => "h", "\xC4\xA6" => "H", "\xC4\xA7" => "h", + "\xC4\xA8" => "I", "\xC4\xA9" => "i", "\xC4\xAA" => "I", "\xC4\xAB" => "i", + "\xC4\xAC" => "I", "\xC4\xAD" => "i", "\xC4\xAE" => "I", "\xC4\xAF" => "i", + "\xC4\xB0" => "I", "\xC4\xB1" => "i", "\xC4\xB2" => "IJ","\xC4\xB3" => "ij", + "\xC4\xB4" => "J", "\xC4\xB5" => "j", "\xC4\xB6" => "K", "\xC4\xB7" => "k", + "\xC4\xB8" => "k", "\xC4\xB9" => "L", "\xC4\xBA" => "l", "\xC4\xBB" => "L", + "\xC4\xBC" => "l", "\xC4\xBD" => "L", "\xC4\xBE" => "l", "\xC4\xBF" => "L", + "\xC5\x80" => "l", "\xC5\x81" => "L", "\xC5\x82" => "l", "\xC5\x83" => "N", + "\xC5\x84" => "n", "\xC5\x85" => "N", "\xC5\x86" => "n", "\xC5\x87" => "N", + "\xC5\x88" => "n", "\xC5\x89" => "n", "\xC5\x8A" => "N", "\xC5\x8B" => "n", + "\xC5\x8C" => "O", "\xC5\x8D" => "o", "\xC5\x8E" => "O", "\xC5\x8F" => "o", + "\xC5\x90" => "O", "\xC5\x91" => "o", "\xC5\x92" => "CE","\xC5\x93" => "ce", + "\xC5\x94" => "R", "\xC5\x95" => "r", "\xC5\x96" => "R", "\xC5\x97" => "r", + "\xC5\x98" => "R", "\xC5\x99" => "r", "\xC5\x9A" => "S", "\xC5\x9B" => "s", + "\xC5\x9C" => "S", "\xC5\x9D" => "s", "\xC5\x9E" => "S", "\xC5\x9F" => "s", + "\xC5\xA0" => "S", "\xC5\xA1" => "s", "\xC5\xA2" => "T", "\xC5\xA3" => "t", + "\xC5\xA4" => "T", "\xC5\xA5" => "t", "\xC5\xA6" => "T", "\xC5\xA7" => "t", + "\xC5\xA8" => "U", "\xC5\xA9" => "u", "\xC5\xAA" => "U", "\xC5\xAB" => "u", + "\xC5\xAC" => "U", "\xC5\xAD" => "u", "\xC5\xAE" => "U", "\xC5\xAF" => "u", + "\xC5\xB0" => "U", "\xC5\xB1" => "u", "\xC5\xB2" => "U", "\xC5\xB3" => "u", + "\xC5\xB4" => "W", "\xC5\xB5" => "w", "\xC5\xB6" => "Y", "\xC5\xB7" => "y", + "\xC5\xB8" => "Y", "\xC5\xB9" => "Z", "\xC5\xBA" => "z", "\xC5\xBB" => "Z", + "\xC5\xBC" => "z", "\xC5\xBD" => "Z", "\xC5\xBE" => "z", "\xC6\x8F" => "E", + "\xC6\xA0" => "O", "\xC6\xA1" => "o", "\xC6\xAF" => "U", "\xC6\xB0" => "u", + "\xC7\x8D" => "A", "\xC7\x8E" => "a", "\xC7\x8F" => "I", + "\xC7\x90" => "i", "\xC7\x91" => "O", "\xC7\x92" => "o", "\xC7\x93" => "U", + "\xC7\x94" => "u", "\xC7\x95" => "U", "\xC7\x96" => "u", "\xC7\x97" => "U", + "\xC7\x98" => "u", "\xC7\x99" => "U", "\xC7\x9A" => "u", "\xC7\x9B" => "U", + "\xC7\x9C" => "u", + "\xC7\xBA" => "A", "\xC7\xBB" => "a", "\xC7\xBC" => "AE","\xC7\xBD" => "ae", + "\xC7\xBE" => "O", "\xC7\xBF" => "o", + "\xC9\x99" => "e", + "\xC2\x82" => ",", # High code comma + "\xC2\x84" => ",,", # High code double comma + "\xC2\x85" => "...", # Tripple dot + "\xC2\x88" => "^", # High carat + "\xC2\x91" => "\x27", # Forward single quote + "\xC2\x92" => "\x27", # Reverse single quote + "\xC2\x93" => "\x22", # Forward double quote + "\xC2\x94" => "\x22", # Reverse double quote + "\xC2\x96" => "-", # High hyphen + "\xC2\x97" => "--", # Double hyphen + "\xC2\xA6" => "|", # Split vertical bar + "\xC2\xAB" => "<<", # Double less than + "\xC2\xBB" => ">>", # Double greater than + "\xC2\xBC" => "1/4", # one quarter + "\xC2\xBD" => "1/2", # one half + "\xC2\xBE" => "3/4", # three quarters + "\xCA\xBF" => "\x27", # c-single quote + "\xCC\xA8" => "", # modifier - under curve + "\xCC\xB1" => "", # modifier - under line + /\W/ => "" + } end @@ -74,6 +153,11 @@ end class String include Mirimiri + def unaccent + # force_encoding is needed with ruby1.9 + Transmap.inject(self.force_encoding("ASCII-8BIT")) { |str, (utf8, asc)| str.gsub(utf8, asc) } + end + # Returns +true+ if +self+ belongs to Rir::Stoplist, +false+ otherwise. def is_stopword? Stoplist.include?(self.downcase)