Commit b843bae6b0589e55e85ce55756b97ae3cbd7d6d4
1 parent
35f45ab54d
Exists in
master
new files + tests
Showing 4 changed files with 157 additions and 0 deletions Inline Diff
Rakefile
File was created | 1 | require 'rake' | |
2 | require 'rake/testtask' | ||
3 | |||
# Directories added to the load path while the test suite runs.
test_load_paths = ['lib', 'lib/rir', 'test']

# Defines the `test` task, which runs every file matching
# test/**/*_test.rb in verbose mode.
Rake::TestTask.new(:test) do |t|
  t.libs.concat(test_load_paths)
  t.pattern = 'test/**/*_test.rb'
  t.verbose = true
end

# Running `rake` with no arguments runs the test suite.
task :default => [:test]
11 |
lib/rir/corpus.rb
File was created | 1 | #!/usr/bin/env ruby | |
2 | |||
3 | # This file is a part of an Information Retrieval oriented Ruby library | ||
4 | # | ||
5 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | ||
6 | # | ||
7 | # This program is free software: you can redistribute it and/or modify | ||
8 | # it under the terms of the GNU General Public License as published by | ||
9 | # the Free Software Foundation, either version 3 of the License, or | ||
10 | # (at your option) any later version. | ||
11 | # | ||
12 | # This program is distributed in the hope that it will be useful, | ||
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
15 | # GNU General Public License for more details. | ||
16 | # | ||
17 | # You should have received a copy of the GNU General Public License | ||
18 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
19 | |||
20 | # General module for many purposes related to Information Retrieval. | ||
module RIR

  # A set of documents stored below a directory on disk.
  class Corpus
    # Root directory of the corpus, exactly as given to the constructor.
    attr_accessor :path

    # path - String path of the directory that holds the corpus.
    def initialize(path)
      @path = path
    end

    # Lists the entries found at any depth below +path+ whose name matches
    # the glob <tt>*.*</tt> (i.e. contains a dot and is not hidden).
    #
    # The search is rooted at +path+ rather than at the current working
    # directory, so the result does not depend on where the process runs.
    #
    # Returns an Array of Strings; every entry is prefixed with +path+ and
    # can therefore be opened directly.
    #
    # NOTE: +path+ becomes part of a glob pattern, so glob metacharacters
    # it contains (*, ?, [], {}) are interpreted as such.
    def files
      Dir.glob(File.join(@path, '**', '*.*'))
    end
  end

end
36 |
lib/rir/query.rb
File was created | 1 | #!/usr/bin/env ruby | |
2 | |||
3 | # This file is a part of an Information Retrieval oriented Ruby library | ||
4 | # | ||
5 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | ||
6 | # | ||
7 | # This program is free software: you can redistribute it and/or modify | ||
8 | # it under the terms of the GNU General Public License as published by | ||
9 | # the Free Software Foundation, either version 3 of the License, or | ||
10 | # (at your option) any later version. | ||
11 | # | ||
12 | # This program is distributed in the hope that it will be useful, | ||
13 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
14 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
15 | # GNU General Public License for more details. | ||
16 | # | ||
17 | # You should have received a copy of the GNU General Public License | ||
18 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
19 | |||
20 | # General module for many purposes related to Information Retrieval. | ||
module RIR

  # Common ancestor of all query types.
  class Query
  end

  module Indri

    # Settings serialized into the <parameters> element of an Indri
    # query parameter file.
    class Parameters
      attr_accessor :corpus, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline

      # corpus      - value written to <index>.
      # mem         - value written to <memory> (default "1g").
      # count       - value written to <count> (default "1000").
      # offset      - value written to <queryOffset> (default "1").
      # run_id      - value written to <runID> (default "default").
      # print_query - truthy to write "true" to <printQuery>.
      # print_docs  - truthy to write "true" to <printDocuments>.
      #
      # +rule+ and +baseline+ start out nil and are set through their
      # accessors.
      def initialize(corpus,mem="1g",count="1000",offset="1",run_id="default",print_query=false,print_docs=false)
        @corpus = corpus
        @memory = mem
        @count  = count
        @offset = offset
        @run_id = run_id
        # Both flags are stored as the strings that end up in the XML.
        @print_query = print_query ? "true" : "false"
        @print_docs  = print_docs  ? "true" : "false"
      end

      # Serializes the settings as XML.
      #
      # The <parameters> element is deliberately left open: IndriQuery#to_s
      # appends the <query> element and the closing tag.
      #
      # Returns a new String on every call.
      def to_s
        # A baseline takes precedence over a rule when both are set.
        model = if @baseline.nil?
                  "<rule>#{@rule}</rule>\n"
                else
                  "<baseline>#{@baseline}</baseline>\n"
                end

        [
          "<parameters>\n",
          "<memory>#{@memory}</memory>\n",
          "<index>#{@corpus}</index>\n",
          "<count>#{@count}</count>\n",
          model,
          "<queryOffset>#{@offset}</queryOffset>\n",
          "<runID>#{@run_id}</runID>\n",
          "<printQuery>#{@print_query}</printQuery>\n",
          "<printDocuments>#{@print_docs}</printDocuments>\n"
        ].join
      end
    end

    # A single query together with the Parameters it is run with.
    class IndriQuery < Query
      attr_accessor :id, :query, :params

      # id     - value written to <number>.
      # query  - value written to <text>.
      # params - Parameters instance. It is stored as-is (not copied) and
      #          receives a default rule if it has none, so the caller's
      #          object is modified.
      def initialize(id,query,params)
        @params = params
        # Here we set the default retrieval model as Language Modeling
        # with a Dirichlet smoothing at 2500.
        # TODO: maybe a Rule class...
        @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil?

        @id    = id
        @query = query
      end

      # The rule in effect for this query. It lives on the parameter set,
      # which is the only place the serialization reads it from.
      def rule
        @params.rule
      end

      # Changes the rule used when the query is serialized. The value is
      # written to +params+, so other queries that share the same
      # Parameters instance see the change as well.
      def rule=(value)
        @params.rule = value
      end

      # Serializes the parameters followed by the query itself and closes
      # the <parameters> element.
      #
      # Returns a String.
      def to_s
        [
          @params.to_s,
          "<query>\n",
          "<number>#{@id}</number>\n",
          "<text>#{@query}</text>\n",
          "</query>\n",
          "</parameters>"
        ].join
      end
    end

  end
end
89 |
test/string_test.rb
File was created | 1 | #!/usr/bin/env ruby | |
2 | |||
3 | require 'test/unit' | ||
4 | |||
5 | require 'string' | ||
6 | |||
7 | class TestString < Test::Unit::TestCase | ||
8 | |||
9 | def test_extract_xml | ||
10 | s = "four-piece in <a>Indianapolis</a>, <a>Indiana</a> at the Murat Theatre" | ||
11 | assert_equal(["Indianapolis", "Indiana"],s.extract_xmltags_values('a')) | ||
12 | end | ||
13 | |||
14 | def test_stopword | ||
15 | assert_equal(true, "is".is_stopword?) | ||
16 | assert_equal(true, "seen".is_stopword?) | ||
17 | assert_equal(false, "totally".is_stopword?) | ||
18 | assert_equal(false, "Paris".is_stopword?) | ||
19 | end | ||
20 | |||
21 | def test_strip_xml | ||
22 | assert_equal("testme", "<test>testme</test>".strip_xml_tags) | ||
23 | end | ||
24 | end | ||
25 |