From b843bae6b0589e55e85ce55756b97ae3cbd7d6d4 Mon Sep 17 00:00:00 2001 From: Romain Deveaud Date: Sat, 13 Nov 2010 02:51:58 +0100 Subject: [PATCH] new files + tests --- Rakefile | 10 ++++++ lib/rir/corpus.rb | 35 +++++++++++++++++++++ lib/rir/query.rb | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++ test/string_test.rb | 24 +++++++++++++++ 4 files changed, 157 insertions(+) create mode 100644 Rakefile create mode 100644 lib/rir/corpus.rb create mode 100644 lib/rir/query.rb create mode 100644 test/string_test.rb diff --git a/Rakefile b/Rakefile new file mode 100644 index 0000000..c9fbb55 --- /dev/null +++ b/Rakefile @@ -0,0 +1,10 @@ +require 'rake' +require 'rake/testtask' + +Rake::TestTask.new(:test) do |test| + test.libs << 'lib' << 'lib/rir' << 'test' + test.pattern = 'test/**/*_test.rb' + test.verbose = true +end + +task :default => :test diff --git a/lib/rir/corpus.rb b/lib/rir/corpus.rb new file mode 100644 index 0000000..44d2f3f --- /dev/null +++ b/lib/rir/corpus.rb @@ -0,0 +1,35 @@ +#!/usr/bin/env ruby + +# This file is a part of an Information Retrieval oriented Ruby library +# +# Copyright (C) 2010-2011 Romain Deveaud +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +# General module for many purposes related to Information Retrieval. 
+module RIR + + class Corpus + attr_accessor :path + + def initialize(path) + @path = path + end + + def files + Dir.glob("**/*.*") + end + end + +end diff --git a/lib/rir/query.rb b/lib/rir/query.rb new file mode 100644 index 0000000..63ca4ca --- /dev/null +++ b/lib/rir/query.rb @@ -0,0 +1,88 @@ +#!/usr/bin/env ruby + +# This file is a part of an Information Retrieval oriented Ruby library +# +# Copyright (C) 2010-2011 Romain Deveaud +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +# General module for many purposes related to Information Retrieval. +module RIR + + class Query + end + + module Indri + + class Parameters + attr_accessor :corpus, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline + + def initialize(corpus,mem="1g",count="1000",offset="1",run_id="default",print_query=false,print_docs=false) + @corpus = corpus + @memory = mem + @count = count + @offset = offset + @run_id = run_id + @print_query = print_query ? "true" : "false" + @print_docs = print_docs ? "true" : "false" + end + + def to_s + h = "\n" + h += "#{@memory}\n" + h += "#{@corpus}\n" + h += "#{@count}\n" + unless @baseline.nil? 
+ h += "#{@baseline}\n" + else + h += "#{@rule}\n" + end + h += "#{@offset}\n" + h += "#{@run_id}\n" + h += "#{@print_query}\n" + h += "#{@print_docs}\n" + + h + end + end + + class IndriQuery < Query + attr_accessor :id, :query, :params, :rule + + def initialize(id,query,params) +# @params = Parameters === params ? params : Parameters.new(corpus) + @params = params + # Here we set the default retrieval model as Language Modeling + # with a Dirichlet smoothing at 2500. + # TODO: maybe a Rule class... + @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil? + + @id = id + @query = query + end + + def to_s + h = @params.to_s + h += "\n" + h += "#{@id}\n" + h += "#{@query}\n" + h += "\n" + h += "" + + h + end + end + + end +end diff --git a/test/string_test.rb b/test/string_test.rb new file mode 100644 index 0000000..55bbaa4 --- /dev/null +++ b/test/string_test.rb @@ -0,0 +1,24 @@ +#!/usr/bin/env ruby + +require 'test/unit' + +require 'string' + +class TestString < Test::Unit::TestCase + + def test_extract_xml + s = "four-piece in Indianapolis, Indiana at the Murat Theatre" + assert_equal(["Indianapolis", "Indiana"],s.extract_xmltags_values('a')) + end + + def test_stopword + assert_equal(true, "is".is_stopword?) + assert_equal(true, "seen".is_stopword?) + assert_equal(false, "totally".is_stopword?) + assert_equal(false, "Paris".is_stopword?) + end + + def test_strip_xml + assert_equal("testme", "testme".strip_xml_tags) + end +end -- 1.8.2.3