Commit b843bae6b0589e55e85ce55756b97ae3cbd7d6d4

Authored by Romain Deveaud
1 parent 35f45ab54d
Exists in master

new files + tests

Showing 4 changed files with 157 additions and 0 deletions Side-by-side Diff

  1 +require 'rake'
  2 +require 'rake/testtask'
  3 +
  4 +Rake::TestTask.new(:test) do |test|
  5 + test.libs << 'lib' << 'lib/rir' << 'test'
  6 + test.pattern = 'test/**/*_test.rb'
  7 + test.verbose = true
  8 +end
  9 +
  10 +task :default => :test
  1 +#!/usr/bin/env ruby
  2 +
  3 +# This file is a part of an Information Retrieval oriented Ruby library
  4 +#
  5 +# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
  6 +#
  7 +# This program is free software: you can redistribute it and/or modify
  8 +# it under the terms of the GNU General Public License as published by
  9 +# the Free Software Foundation, either version 3 of the License, or
  10 +# (at your option) any later version.
  11 +#
  12 +# This program is distributed in the hope that it will be useful,
  13 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15 +# GNU General Public License for more details.
  16 +#
  17 +# You should have received a copy of the GNU General Public License
  18 +# along with this program. If not, see <http://www.gnu.org/licenses/>.
  19 +
  20 +# General module for many purposes related to Information Retrieval.
  21 +module RIR
  22 +
  23 + class Corpus
  24 + attr_accessor :path
  25 +
  26 + def initialize(path)
  27 + @path = path
  28 + end
  29 +
  30 + def files
  31 + Dir.glob("**/*.*")
  32 + end
  33 + end
  34 +
  35 +end
  1 +#!/usr/bin/env ruby
  2 +
  3 +# This file is a part of an Information Retrieval oriented Ruby library
  4 +#
  5 +# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
  6 +#
  7 +# This program is free software: you can redistribute it and/or modify
  8 +# it under the terms of the GNU General Public License as published by
  9 +# the Free Software Foundation, either version 3 of the License, or
  10 +# (at your option) any later version.
  11 +#
  12 +# This program is distributed in the hope that it will be useful,
  13 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
  14 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  15 +# GNU General Public License for more details.
  16 +#
  17 +# You should have received a copy of the GNU General Public License
  18 +# along with this program. If not, see <http://www.gnu.org/licenses/>.
  19 +
  20 +# General module for many purposes related to Information Retrieval.
  21 +module RIR
  22 +
  23 + class Query
  24 + end
  25 +
  26 + module Indri
  27 +
  28 + class Parameters
  29 + attr_accessor :corpus, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline
  30 +
  31 + def initialize(corpus,mem="1g",count="1000",offset="1",run_id="default",print_query=false,print_docs=false)
  32 + @corpus = corpus
  33 + @memory = mem
  34 + @count = count
  35 + @offset = offset
  36 + @run_id = run_id
  37 + @print_query = print_query ? "true" : "false"
  38 + @print_docs = print_docs ? "true" : "false"
  39 + end
  40 +
  41 + def to_s
  42 + h = "<parameters>\n"
  43 + h += "<memory>#{@memory}</memory>\n"
  44 + h += "<index>#{@corpus}</index>\n"
  45 + h += "<count>#{@count}</count>\n"
  46 + unless @baseline.nil?
  47 + h += "<baseline>#{@baseline}</baseline>\n"
  48 + else
  49 + h += "<rule>#{@rule}</rule>\n"
  50 + end
  51 + h += "<queryOffset>#{@offset}</queryOffset>\n"
  52 + h += "<runID>#{@run_id}</runID>\n"
  53 + h += "<printQuery>#{@print_query}</printQuery>\n"
  54 + h += "<printDocuments>#{@print_docs}</printDocuments>\n"
  55 +
  56 + h
  57 + end
  58 + end
  59 +
  60 + class IndriQuery < Query
  61 + attr_accessor :id, :query, :params, :rule
  62 +
  63 + def initialize(id,query,params)
  64 +# @params = Parameters === params ? params : Parameters.new(corpus)
  65 + @params = params
  66 + # Here we set the default retrieval model as Language Modeling
  67 + # with a Dirichlet smoothing at 2500.
  68 + # TODO: maybe a Rule class...
  69 + @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil?
  70 +
  71 + @id = id
  72 + @query = query
  73 + end
  74 +
  75 + def to_s
  76 + h = @params.to_s
  77 + h += "<query>\n"
  78 + h += "<number>#{@id}</number>\n"
  79 + h += "<text>#{@query}</text>\n"
  80 + h += "</query>\n"
  81 + h += "</parameters>"
  82 +
  83 + h
  84 + end
  85 + end
  86 +
  87 + end
  88 +end
  1 +#!/usr/bin/env ruby
  2 +
  3 +require 'test/unit'
  4 +
  5 +require 'string'
  6 +
  7 +class TestString < Test::Unit::TestCase
  8 +
  9 + def test_extract_xml
  10 + s = "four-piece in <a>Indianapolis</a>, <a>Indiana</a> at the Murat Theatre"
  11 + assert_equal(["Indianapolis", "Indiana"],s.extract_xmltags_values('a'))
  12 + end
  13 +
  14 + def test_stopword
  15 + assert_equal(true, "is".is_stopword?)
  16 + assert_equal(true, "seen".is_stopword?)
  17 + assert_equal(false, "totally".is_stopword?)
  18 + assert_equal(false, "Paris".is_stopword?)
  19 + end
  20 +
  21 + def test_strip_xml
  22 + assert_equal("testme", "<test>testme</test>".strip_xml_tags)
  23 + end
  24 +end