Commit b843bae6b0589e55e85ce55756b97ae3cbd7d6d4
1 parent
35f45ab54d
Exists in
master
new files + tests
Showing 4 changed files with 157 additions and 0 deletions Side-by-side Diff
Rakefile
lib/rir/corpus.rb
1 | +#!/usr/bin/env ruby | |
2 | + | |
3 | +# This file is a part of an Information Retrieval oriented Ruby library | |
4 | +# | |
5 | +# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | |
6 | +# | |
7 | +# This program is free software: you can redistribute it and/or modify | |
8 | +# it under the terms of the GNU General Public License as published by | |
9 | +# the Free Software Foundation, either version 3 of the License, or | |
10 | +# (at your option) any later version. | |
11 | +# | |
12 | +# This program is distributed in the hope that it will be useful, | |
13 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 | +# GNU General Public License for more details. | |
16 | +# | |
17 | +# You should have received a copy of the GNU General Public License | |
18 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. | |
19 | + | |
20 | +# General module for many purposes related to Information Retrieval. | |
module RIR

  # A document collection identified by a root directory on disk.
  class Corpus
    # Root directory of the collection.
    attr_accessor :path

    # path:: root directory of the collection.
    def initialize(path)
      @path = path
    end

    # Lists every entry below +path+, at any depth, whose name contains a
    # dot (the "*.*" pattern).
    #
    # The search is anchored on +path+ rather than on the current working
    # directory, so the result describes this corpus wherever the process
    # was started from. Returned entries are prefixed with +path+.
    #
    # NOTE(review): +path+ is used as part of a glob pattern, so glob
    # metacharacters inside it would be interpreted - confirm callers only
    # pass plain directory names.
    def files
      Dir.glob(File.join(@path, "**", "*.*"))
    end
  end

end
lib/rir/query.rb
1 | +#!/usr/bin/env ruby | |
2 | + | |
3 | +# This file is a part of an Information Retrieval oriented Ruby library | |
4 | +# | |
5 | +# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | |
6 | +# | |
7 | +# This program is free software: you can redistribute it and/or modify | |
8 | +# it under the terms of the GNU General Public License as published by | |
9 | +# the Free Software Foundation, either version 3 of the License, or | |
10 | +# (at your option) any later version. | |
11 | +# | |
12 | +# This program is distributed in the hope that it will be useful, | |
13 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 | +# GNU General Public License for more details. | |
16 | +# | |
17 | +# You should have received a copy of the GNU General Public License | |
18 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. | |
19 | + | |
20 | +# General module for many purposes related to Information Retrieval. | |
module RIR

  # Base class for queries; it defines no behaviour of its own.
  class Query
  end

  module Indri

    # Settings rendered as the opening part of an XML <parameters> block.
    #
    # +to_s+ deliberately leaves <parameters> unclosed: IndriQuery#to_s
    # appends the <query> element and the closing tag.
    class Parameters
      attr_accessor :corpus, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline

      # corpus::      value written to <index>.
      # mem::         value written to <memory> (default "1g").
      # count::       value written to <count> (default "1000").
      # offset::      value written to <queryOffset> (default "1").
      # run_id::      value written to <runID> (default "default").
      # print_query:: truthy/falsy flag, stored as "true"/"false".
      # print_docs::  truthy/falsy flag, stored as "true"/"false".
      #
      # +rule+ and +baseline+ are not constructor arguments; set them through
      # their accessors.
      def initialize(corpus,mem="1g",count="1000",offset="1",run_id="default",print_query=false,print_docs=false)
        @corpus = corpus
        @memory = mem
        @count  = count
        @offset = offset
        @run_id = run_id
        @print_query = print_query ? "true" : "false"
        @print_docs  = print_docs  ? "true" : "false"
      end

      # Returns the parameters as newline-terminated XML elements, starting
      # with an opening <parameters> tag and without the closing one.
      def to_s
        # A baseline, when set, takes the place of the rule element.
        retrieval = if @baseline.nil?
                      "<rule>#{@rule}</rule>"
                    else
                      "<baseline>#{@baseline}</baseline>"
                    end

        [
          "<parameters>",
          "<memory>#{@memory}</memory>",
          "<index>#{@corpus}</index>",
          "<count>#{@count}</count>",
          retrieval,
          "<queryOffset>#{@offset}</queryOffset>",
          "<runID>#{@run_id}</runID>",
          "<printQuery>#{@print_query}</printQuery>",
          "<printDocuments>#{@print_docs}</printDocuments>"
        ].map { |element| "#{element}\n" }.join
      end
    end

    # A single numbered query together with the Parameters it runs under.
    class IndriQuery < Query
      attr_accessor :id, :query, :params

      # id::     value written to <number>.
      # query::  value written to <text>.
      # params:: a Parameters-like object; it is stored as given (not
      #          copied) and receives the default rule when it has none.
      def initialize(id,query,params)
        @params = params
        # Here we set the default retrieval model as Language Modeling
        # with a Dirichlet smoothing at 2500.
        # TODO: maybe a Rule class...
        @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil?

        @id    = id
        @query = query
      end

      # The retrieval rule is stored on the parameters object, so both the
      # reader and the writer forward to it. A rule kept on the query itself
      # would never reach the generated XML.
      def rule
        @params.rule
      end

      def rule=(value)
        @params.rule = value
      end

      # Returns the complete XML document: the parameters, then the <query>
      # element, then the closing </parameters> tag (no trailing newline).
      def to_s
        "#{@params}<query>\n<number>#{@id}</number>\n<text>#{@query}</text>\n</query>\n</parameters>"
      end
    end

  end
end
test/string_test.rb
1 | +#!/usr/bin/env ruby | |
2 | + | |
3 | +require 'test/unit' | |
4 | + | |
5 | +require 'string' | |
6 | + | |
7 | +class TestString < Test::Unit::TestCase | |
8 | + | |
9 | + def test_extract_xml | |
10 | + s = "four-piece in <a>Indianapolis</a>, <a>Indiana</a> at the Murat Theatre" | |
11 | + assert_equal(["Indianapolis", "Indiana"],s.extract_xmltags_values('a')) | |
12 | + end | |
13 | + | |
14 | + def test_stopword | |
15 | + assert_equal(true, "is".is_stopword?) | |
16 | + assert_equal(true, "seen".is_stopword?) | |
17 | + assert_equal(false, "totally".is_stopword?) | |
18 | + assert_equal(false, "Paris".is_stopword?) | |
19 | + end | |
20 | + | |
21 | + def test_strip_xml | |
22 | + assert_equal("testme", "<test>testme</test>".strip_xml_tags) | |
23 | + end | |
24 | +end |