Commit 845768f8ac5a1593db356377fcc68208c12efa74
1 parent
175908fe2a
Exists in
master
creating a group of indri queries is possible. added an accent stripping function.
Showing 4 changed files with 116 additions and 17 deletions Side-by-side Diff
examples/entropy.rb
1 | -require 'rir' | |
1 | +require 'mirimiri' | |
2 | 2 | |
3 | 3 | # Concatenates all lines from one file, without \n |
4 | 4 | readme = File.open('README.markdown').readlines.collect { |l| l.chomp }.join(" ") |
5 | 5 | |
6 | 6 | # Creates the document with a string |
7 | -doc = RIR::Document.new readme | |
7 | +doc = Mirimiri::Document.new readme | |
8 | 8 | |
9 | 9 | # Outputs all the unique words of the document with their entropy scores |
10 | 10 | p doc.words.collect { |w| "#{w} => #{doc.entropy w}" } |
lib/mirimiri/document.rb
lib/mirimiri/query.rb
... | ... | @@ -27,7 +27,7 @@ |
27 | 27 | class Parameters |
28 | 28 | attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline |
29 | 29 | |
30 | - def initialize(corpus,mem="1g",count="1000",offset="1",run_id="default",print_query=false,print_docs=false) | |
30 | + def initialize(corpus,count="1000",mem="1g",offset="1",run_id="default",print_query=false,print_docs=false) | |
31 | 31 | @index_path = corpus |
32 | 32 | @memory = mem |
33 | 33 | @count = count |
... | ... | @@ -38,8 +38,7 @@ |
38 | 38 | end |
39 | 39 | |
40 | 40 | def to_s |
41 | - h = "<parameters>\n" | |
42 | - h += "<memory>#{@memory}</memory>\n" | |
41 | + h = "<memory>#{@memory}</memory>\n" | |
43 | 42 | h += "<index>#{@index_path}</index>\n" |
44 | 43 | h += "<count>#{@count}</count>\n" |
45 | 44 | unless @baseline.nil? |
... | ... | @@ -47,6 +46,7 @@ |
47 | 46 | else |
48 | 47 | h += "<rule>#{@rule}</rule>\n" |
49 | 48 | end |
49 | + h += "<trecFormat>true</trecFormat>\n" | |
50 | 50 | h += "<queryOffset>#{@offset}</queryOffset>\n" |
51 | 51 | h += "<runID>#{@run_id}</runID>\n" |
52 | 52 | h += "<printQuery>#{@print_query}</printQuery>\n" |
53 | 53 | |
54 | 54 | |
55 | 55 | |
56 | 56 | |
... | ... | @@ -55,27 +55,42 @@ |
55 | 55 | h |
56 | 56 | end |
57 | 57 | end |
58 | - | |
58 | + | |
59 | 59 | class IndriQuery < Query |
60 | - attr_accessor :id, :query, :params, :rule | |
60 | + attr_accessor :id, :query, :rule | |
61 | 61 | |
62 | - def initialize(id,query,params) | |
63 | - @params = params | |
64 | - # Here we set the default retrieval model as Language Modeling | |
65 | - # with a Dirichlet smoothing at 2500. | |
66 | - # TODO: maybe a Rule class... | |
67 | - @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil? | |
68 | - | |
62 | + def initialize(id,query) | |
69 | 63 | @id = id |
70 | 64 | @query = query |
71 | 65 | end |
72 | 66 | |
73 | 67 | def to_s |
74 | - h = @params.to_s | |
75 | - h += "<query>\n" | |
68 | + h = "<query>\n" | |
76 | 69 | h += "<number>#{@id}</number>\n" |
77 | 70 | h += "<text>#{@query}</text>\n" |
78 | 71 | h += "</query>\n" |
72 | + | |
73 | + h | |
74 | + end | |
75 | + end | |
76 | + | |
77 | + class IndriQueries | |
78 | + attr_accessor :params, :queries | |
79 | + | |
80 | + def initialize(params,*queries) | |
81 | + @queries = queries | |
82 | + | |
83 | + @params = params | |
84 | + # Here we set the default retrieval model as Language Modeling | |
85 | + # with a Dirichlet smoothing at 2500. | |
86 | + # TODO: maybe a Rule class... | |
87 | + @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil? | |
88 | + end | |
89 | + | |
90 | + def to_s | |
91 | + h = "<parameters>\n" | |
92 | + h += @params.to_s | |
93 | + h += @queries.collect { |q| q.to_s }.join "" | |
79 | 94 | h += "</parameters>" |
80 | 95 | |
81 | 96 | h |
lib/mirimiri/string.rb
... | ... | @@ -67,12 +67,96 @@ |
67 | 67 | "yours", "yourself", "yourselves" |
68 | 68 | ] |
69 | 69 | |
70 | + Transmap = { | |
71 | + "\xC3\x80" => "A", "\xC3\x81" => "A", "\xC3\x82" => "A", "\xC3\x83" => "A", | |
72 | + "\xC3\x84" => "A", "\xC3\x85" => "A", "\xC3\x86" => "AE","\xC3\x87" => "C", | |
73 | + "\xC3\x88" => "E", "\xC3\x89" => "E", "\xC3\x8A" => "E", "\xC3\x8B" => "E", | |
74 | + "\xC3\x8C" => "I", "\xC3\x8D" => "I", "\xC3\x8E" => "I", "\xC3\x8F" => "I", | |
75 | + "\xC3\x90" => "D", "\xC3\x91" => "N", "\xC3\x92" => "O", "\xC3\x93" => "O", | |
76 | + "\xC3\x94" => "O", "\xC3\x95" => "O", "\xC3\x96" => "O", "\xC3\x98" => "O", | |
77 | + "\xC3\x99" => "U", "\xC3\x9A" => "U", "\xC3\x9B" => "U", "\xC3\x9C" => "U", | |
78 | + "\xC3\x9D" => "Y", "\xC3\x9E" => "P", "\xC3\x9F" => "ss", | |
79 | + "\xC3\xA0" => "a", "\xC3\xA1" => "a", "\xC3\xA2" => "a", "\xC3\xA3" => "a", | |
80 | + "\xC3\xA4" => "a", "\xC3\xA5" => "a", "\xC3\xA6" => "ae","\xC3\xA7" => "c", | |
81 | + "\xC3\xA8" => "e", "\xC3\xA9" => "e", "\xC3\xAA" => "e", "\xC3\xAB" => "e", | |
82 | + "\xC3\xAC" => "i", "\xC3\xAD" => "i", "\xC3\xAE" => "i", "\xC3\xAF" => "i", | |
83 | + "\xC3\xB0" => "o", "\xC3\xB1" => "n", "\xC3\xB2" => "o", "\xC3\xB3" => "o", | |
84 | + "\xC3\xB4" => "o", "\xC3\xB5" => "o", "\xC3\xB6" => "o", "\xC3\xB8" => "o", | |
85 | + "\xC3\xB9" => "u", "\xC3\xBA" => "u", "\xC3\xBB" => "u", "\xC3\xBC" => "u", | |
86 | + "\xC3\xBD" => "y", "\xC3\xBE" => "p", "\xC3\xBF" => "y", | |
87 | + "\xC4\x80" => "A", "\xC4\x81" => "a", "\xC4\x82" => "A", "\xC4\x83" => "a", | |
88 | + "\xC4\x84" => "A", "\xC4\x85" => "a", "\xC4\x86" => "C", "\xC4\x87" => "c", | |
89 | + "\xC4\x88" => "C", "\xC4\x89" => "c", "\xC4\x8A" => "C", "\xC4\x8B" => "c", | |
90 | + "\xC4\x8C" => "C", "\xC4\x8D" => "c", "\xC4\x8E" => "D", "\xC4\x8F" => "d", | |
91 | + "\xC4\x90" => "D", "\xC4\x91" => "d", "\xC4\x92" => "E", "\xC4\x93" => "e", | |
92 | + "\xC4\x94" => "E", "\xC4\x95" => "e", "\xC4\x96" => "E", "\xC4\x97" => "e", | |
93 | + "\xC4\x98" => "E", "\xC4\x99" => "e", "\xC4\x9A" => "E", "\xC4\x9B" => "e", | |
94 | + "\xC4\x9C" => "G", "\xC4\x9D" => "g", "\xC4\x9E" => "G", "\xC4\x9F" => "g", | |
95 | + "\xC4\xA0" => "G", "\xC4\xA1" => "g", "\xC4\xA2" => "G", "\xC4\xA3" => "g", | |
96 | + "\xC4\xA4" => "H", "\xC4\xA5" => "h", "\xC4\xA6" => "H", "\xC4\xA7" => "h", | |
97 | + "\xC4\xA8" => "I", "\xC4\xA9" => "i", "\xC4\xAA" => "I", "\xC4\xAB" => "i", | |
98 | + "\xC4\xAC" => "I", "\xC4\xAD" => "i", "\xC4\xAE" => "I", "\xC4\xAF" => "i", | |
99 | + "\xC4\xB0" => "I", "\xC4\xB1" => "i", "\xC4\xB2" => "IJ","\xC4\xB3" => "ij", | |
100 | + "\xC4\xB4" => "J", "\xC4\xB5" => "j", "\xC4\xB6" => "K", "\xC4\xB7" => "k", | |
101 | + "\xC4\xB8" => "k", "\xC4\xB9" => "L", "\xC4\xBA" => "l", "\xC4\xBB" => "L", | |
102 | + "\xC4\xBC" => "l", "\xC4\xBD" => "L", "\xC4\xBE" => "l", "\xC4\xBF" => "L", | |
103 | + "\xC5\x80" => "l", "\xC5\x81" => "L", "\xC5\x82" => "l", "\xC5\x83" => "N", | |
104 | + "\xC5\x84" => "n", "\xC5\x85" => "N", "\xC5\x86" => "n", "\xC5\x87" => "N", | |
105 | + "\xC5\x88" => "n", "\xC5\x89" => "n", "\xC5\x8A" => "N", "\xC5\x8B" => "n", | |
106 | + "\xC5\x8C" => "O", "\xC5\x8D" => "o", "\xC5\x8E" => "O", "\xC5\x8F" => "o", | |
107 | + "\xC5\x90" => "O", "\xC5\x91" => "o", "\xC5\x92" => "CE","\xC5\x93" => "ce", | |
108 | + "\xC5\x94" => "R", "\xC5\x95" => "r", "\xC5\x96" => "R", "\xC5\x97" => "r", | |
109 | + "\xC5\x98" => "R", "\xC5\x99" => "r", "\xC5\x9A" => "S", "\xC5\x9B" => "s", | |
110 | + "\xC5\x9C" => "S", "\xC5\x9D" => "s", "\xC5\x9E" => "S", "\xC5\x9F" => "s", | |
111 | + "\xC5\xA0" => "S", "\xC5\xA1" => "s", "\xC5\xA2" => "T", "\xC5\xA3" => "t", | |
112 | + "\xC5\xA4" => "T", "\xC5\xA5" => "t", "\xC5\xA6" => "T", "\xC5\xA7" => "t", | |
113 | + "\xC5\xA8" => "U", "\xC5\xA9" => "u", "\xC5\xAA" => "U", "\xC5\xAB" => "u", | |
114 | + "\xC5\xAC" => "U", "\xC5\xAD" => "u", "\xC5\xAE" => "U", "\xC5\xAF" => "u", | |
115 | + "\xC5\xB0" => "U", "\xC5\xB1" => "u", "\xC5\xB2" => "U", "\xC5\xB3" => "u", | |
116 | + "\xC5\xB4" => "W", "\xC5\xB5" => "w", "\xC5\xB6" => "Y", "\xC5\xB7" => "y", | |
117 | + "\xC5\xB8" => "Y", "\xC5\xB9" => "Z", "\xC5\xBA" => "z", "\xC5\xBB" => "Z", | |
118 | + "\xC5\xBC" => "z", "\xC5\xBD" => "Z", "\xC5\xBE" => "z", "\xC6\x8F" => "E", | |
119 | + "\xC6\xA0" => "O", "\xC6\xA1" => "o", "\xC6\xAF" => "U", "\xC6\xB0" => "u", | |
120 | + "\xC7\x8D" => "A", "\xC7\x8E" => "a", "\xC7\x8F" => "I", | |
121 | + "\xC7\x90" => "i", "\xC7\x91" => "O", "\xC7\x92" => "o", "\xC7\x93" => "U", | |
122 | + "\xC7\x94" => "u", "\xC7\x95" => "U", "\xC7\x96" => "u", "\xC7\x97" => "U", | |
123 | + "\xC7\x98" => "u", "\xC7\x99" => "U", "\xC7\x9A" => "u", "\xC7\x9B" => "U", | |
124 | + "\xC7\x9C" => "u", | |
125 | + "\xC7\xBA" => "A", "\xC7\xBB" => "a", "\xC7\xBC" => "AE","\xC7\xBD" => "ae", | |
126 | + "\xC7\xBE" => "O", "\xC7\xBF" => "o", | |
127 | + "\xC9\x99" => "e", | |
128 | + "\xC2\x82" => ",", # High code comma | |
129 | + "\xC2\x84" => ",,", # High code double comma | |
130 | + "\xC2\x85" => "...", # Triple dot | |
131 | + "\xC2\x88" => "^", # High caret | |
132 | + "\xC2\x91" => "\x27", # Forward single quote | |
133 | + "\xC2\x92" => "\x27", # Reverse single quote | |
134 | + "\xC2\x93" => "\x22", # Forward double quote | |
135 | + "\xC2\x94" => "\x22", # Reverse double quote | |
136 | + "\xC2\x96" => "-", # High hyphen | |
137 | + "\xC2\x97" => "--", # Double hyphen | |
138 | + "\xC2\xA6" => "|", # Split vertical bar | |
139 | + "\xC2\xAB" => "<<", # Double less than | |
140 | + "\xC2\xBB" => ">>", # Double greater than | |
141 | + "\xC2\xBC" => "1/4", # one quarter | |
142 | + "\xC2\xBD" => "1/2", # one half | |
143 | + "\xC2\xBE" => "3/4", # three quarters | |
144 | + "\xCA\xBF" => "\x27", # c-single quote | |
145 | + "\xCC\xA8" => "", # modifier - under curve | |
146 | + "\xCC\xB1" => "", # modifier - under line | |
147 | + /\W/ => "" | |
148 | + } | |
70 | 149 | |
71 | 150 | end |
72 | 151 | |
73 | 152 | # Extension of the standard class String with useful functions. |
74 | 153 | class String |
75 | 154 | include Mirimiri |
155 | + | |
156 | + def unaccent | |
157 | + # force_encoding is needed with ruby1.9 | |
158 | + Transmap.inject(self.force_encoding("ASCII-8BIT")) { |str, (utf8, asc)| str.gsub(utf8, asc) } | |
159 | + end | |
76 | 160 | |
77 | 161 | # Returns +true+ if +self+ belongs to Mirimiri::Stoplist, +false+ otherwise. |
78 | 162 | def is_stopword? |