Commit 845768f8ac5a1593db356377fcc68208c12efa74

Authored by Romain Deveaud
1 parent 175908fe2a
Exists in master

Creating a group of Indri queries is now possible. Added an accent-stripping function.

Showing 4 changed files with 116 additions and 17 deletions Side-by-side Diff

1   -require 'rir'
  1 +require 'mirimiri'
2 2  
3 3 # Concatenates all lines from one file, without \n
4 4 readme = File.open('README.markdown').readlines.collect { |l| l.chomp }.join(" ")
5 5  
6 6 # Creates the document with a string
7   -doc = RIR::Document.new readme
  7 +doc = Mirimiri::Document.new readme
8 8  
9 9 # Outputs all the unique words of the document with their entropy scores
10 10 p doc.words.collect { |w| "#{w} => #{doc.entropy w}" }
lib/mirimiri/document.rb
... ... @@ -99,7 +99,7 @@
99 99 end
100 100  
101 101  
102   - def initialize(content)
  102 + def initialize(content="")
103 103 @doc_content = content
104 104 @words = format_words
105 105 end
lib/mirimiri/query.rb
... ... @@ -27,7 +27,7 @@
27 27 class Parameters
28 28 attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline
29 29  
30   - def initialize(corpus,mem="1g",count="1000",offset="1",run_id="default",print_query=false,print_docs=false)
  30 + def initialize(corpus,count="1000",mem="1g",offset="1",run_id="default",print_query=false,print_docs=false)
31 31 @index_path = corpus
32 32 @memory = mem
33 33 @count = count
... ... @@ -38,8 +38,7 @@
38 38 end
39 39  
40 40 def to_s
41   - h = "<parameters>\n"
42   - h += "<memory>#{@memory}</memory>\n"
  41 + h = "<memory>#{@memory}</memory>\n"
43 42 h += "<index>#{@index_path}</index>\n"
44 43 h += "<count>#{@count}</count>\n"
45 44 unless @baseline.nil?
... ... @@ -47,6 +46,7 @@
47 46 else
48 47 h += "<rule>#{@rule}</rule>\n"
49 48 end
  49 + h += "<trecFormat>true</trecFormat>\n"
50 50 h += "<queryOffset>#{@offset}</queryOffset>\n"
51 51 h += "<runID>#{@run_id}</runID>\n"
52 52 h += "<printQuery>#{@print_query}</printQuery>\n"
53 53  
54 54  
55 55  
56 56  
... ... @@ -55,27 +55,42 @@
55 55 h
56 56 end
57 57 end
58   -
  58 +
59 59 class IndriQuery < Query
60   - attr_accessor :id, :query, :params, :rule
  60 + attr_accessor :id, :query, :rule
61 61  
62   - def initialize(id,query,params)
63   - @params = params
64   - # Here we set the default retrieval model as Language Modeling
65   - # with a Dirichlet smoothing at 2500.
66   - # TODO: maybe a Rule class...
67   - @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil?
68   -
  62 + def initialize(id,query)
69 63 @id = id
70 64 @query = query
71 65 end
72 66  
73 67 def to_s
74   - h = @params.to_s
75   - h += "<query>\n"
  68 + h = "<query>\n"
76 69 h += "<number>#{@id}</number>\n"
77 70 h += "<text>#{@query}</text>\n"
78 71 h += "</query>\n"
  72 +
  73 + h
  74 + end
  75 + end
  76 +
  77 + class IndriQueries
  78 + attr_accessor :params, :queries
  79 +
  80 + def initialize(params,*queries)
  81 + @queries = queries
  82 +
  83 + @params = params
  84 + # Here we set the default retrieval model as Language Modeling
  85 + # with a Dirichlet smoothing at 2500.
  86 + # TODO: maybe a Rule class...
  87 + @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil?
  88 + end
  89 +
  90 + def to_s
  91 + h = "<parameters>\n"
  92 + h += @params.to_s
  93 + h += @queries.collect { |q| q.to_s }.join ""
79 94 h += "</parameters>"
80 95  
81 96 h
lib/mirimiri/string.rb
... ... @@ -67,12 +67,96 @@
67 67 "yours", "yourself", "yourselves"
68 68 ]
69 69  
  70 + Transmap = {
  71 + "\xC3\x80" => "A", "\xC3\x81" => "A", "\xC3\x82" => "A", "\xC3\x83" => "A",
  72 + "\xC3\x84" => "A", "\xC3\x85" => "A", "\xC3\x86" => "AE","\xC3\x87" => "C",
  73 + "\xC3\x88" => "E", "\xC3\x89" => "E", "\xC3\x8A" => "E", "\xC3\x8B" => "E",
  74 + "\xC3\x8C" => "I", "\xC3\x8D" => "I", "\xC3\x8E" => "I", "\xC3\x8F" => "I",
  75 + "\xC3\x90" => "D", "\xC3\x91" => "N", "\xC3\x92" => "O", "\xC3\x93" => "O",
  76 + "\xC3\x94" => "O", "\xC3\x95" => "O", "\xC3\x96" => "O", "\xC3\x98" => "O",
  77 + "\xC3\x99" => "U", "\xC3\x9A" => "U", "\xC3\x9B" => "U", "\xC3\x9C" => "U",
  78 + "\xC3\x9D" => "Y", "\xC3\x9E" => "P", "\xC3\x9F" => "ss",
  79 + "\xC3\xA0" => "a", "\xC3\xA1" => "a", "\xC3\xA2" => "a", "\xC3\xA3" => "a",
  80 + "\xC3\xA4" => "a", "\xC3\xA5" => "a", "\xC3\xA6" => "ae","\xC3\xA7" => "c",
  81 + "\xC3\xA8" => "e", "\xC3\xA9" => "e", "\xC3\xAA" => "e", "\xC3\xAB" => "e",
  82 + "\xC3\xAC" => "i", "\xC3\xAD" => "i", "\xC3\xAE" => "i", "\xC3\xAF" => "i",
  83 + "\xC3\xB0" => "o", "\xC3\xB1" => "n", "\xC3\xB2" => "o", "\xC3\xB3" => "o",
  84 + "\xC3\xB4" => "o", "\xC3\xB5" => "o", "\xC3\xB6" => "o", "\xC3\xB8" => "o",
  85 + "\xC3\xB9" => "u", "\xC3\xBA" => "u", "\xC3\xBB" => "u", "\xC3\xBC" => "u",
  86 + "\xC3\xBD" => "y", "\xC3\xBE" => "p", "\xC3\xBF" => "y",
  87 + "\xC4\x80" => "A", "\xC4\x81" => "a", "\xC4\x82" => "A", "\xC4\x83" => "a",
  88 + "\xC4\x84" => "A", "\xC4\x85" => "a", "\xC4\x86" => "C", "\xC4\x87" => "c",
  89 + "\xC4\x88" => "C", "\xC4\x89" => "c", "\xC4\x8A" => "C", "\xC4\x8B" => "c",
  90 + "\xC4\x8C" => "C", "\xC4\x8D" => "c", "\xC4\x8E" => "D", "\xC4\x8F" => "d",
  91 + "\xC4\x90" => "D", "\xC4\x91" => "d", "\xC4\x92" => "E", "\xC4\x93" => "e",
  92 + "\xC4\x94" => "E", "\xC4\x95" => "e", "\xC4\x96" => "E", "\xC4\x97" => "e",
  93 + "\xC4\x98" => "E", "\xC4\x99" => "e", "\xC4\x9A" => "E", "\xC4\x9B" => "e",
  94 + "\xC4\x9C" => "G", "\xC4\x9D" => "g", "\xC4\x9E" => "G", "\xC4\x9F" => "g",
  95 + "\xC4\xA0" => "G", "\xC4\xA1" => "g", "\xC4\xA2" => "G", "\xC4\xA3" => "g",
  96 + "\xC4\xA4" => "H", "\xC4\xA5" => "h", "\xC4\xA6" => "H", "\xC4\xA7" => "h",
  97 + "\xC4\xA8" => "I", "\xC4\xA9" => "i", "\xC4\xAA" => "I", "\xC4\xAB" => "i",
  98 + "\xC4\xAC" => "I", "\xC4\xAD" => "i", "\xC4\xAE" => "I", "\xC4\xAF" => "i",
  99 + "\xC4\xB0" => "I", "\xC4\xB1" => "i", "\xC4\xB2" => "IJ","\xC4\xB3" => "ij",
  100 + "\xC4\xB4" => "J", "\xC4\xB5" => "j", "\xC4\xB6" => "K", "\xC4\xB7" => "k",
  101 + "\xC4\xB8" => "k", "\xC4\xB9" => "L", "\xC4\xBA" => "l", "\xC4\xBB" => "L",
  102 + "\xC4\xBC" => "l", "\xC4\xBD" => "L", "\xC4\xBE" => "l", "\xC4\xBF" => "L",
  103 + "\xC5\x80" => "l", "\xC5\x81" => "L", "\xC5\x82" => "l", "\xC5\x83" => "N",
  104 + "\xC5\x84" => "n", "\xC5\x85" => "N", "\xC5\x86" => "n", "\xC5\x87" => "N",
  105 + "\xC5\x88" => "n", "\xC5\x89" => "n", "\xC5\x8A" => "N", "\xC5\x8B" => "n",
  106 + "\xC5\x8C" => "O", "\xC5\x8D" => "o", "\xC5\x8E" => "O", "\xC5\x8F" => "o",
  107 + "\xC5\x90" => "O", "\xC5\x91" => "o", "\xC5\x92" => "CE","\xC5\x93" => "ce",
  108 + "\xC5\x94" => "R", "\xC5\x95" => "r", "\xC5\x96" => "R", "\xC5\x97" => "r",
  109 + "\xC5\x98" => "R", "\xC5\x99" => "r", "\xC5\x9A" => "S", "\xC5\x9B" => "s",
  110 + "\xC5\x9C" => "S", "\xC5\x9D" => "s", "\xC5\x9E" => "S", "\xC5\x9F" => "s",
  111 + "\xC5\xA0" => "S", "\xC5\xA1" => "s", "\xC5\xA2" => "T", "\xC5\xA3" => "t",
  112 + "\xC5\xA4" => "T", "\xC5\xA5" => "t", "\xC5\xA6" => "T", "\xC5\xA7" => "t",
  113 + "\xC5\xA8" => "U", "\xC5\xA9" => "u", "\xC5\xAA" => "U", "\xC5\xAB" => "u",
  114 + "\xC5\xAC" => "U", "\xC5\xAD" => "u", "\xC5\xAE" => "U", "\xC5\xAF" => "u",
  115 + "\xC5\xB0" => "U", "\xC5\xB1" => "u", "\xC5\xB2" => "U", "\xC5\xB3" => "u",
  116 + "\xC5\xB4" => "W", "\xC5\xB5" => "w", "\xC5\xB6" => "Y", "\xC5\xB7" => "y",
  117 + "\xC5\xB8" => "Y", "\xC5\xB9" => "Z", "\xC5\xBA" => "z", "\xC5\xBB" => "Z",
  118 + "\xC5\xBC" => "z", "\xC5\xBD" => "Z", "\xC5\xBE" => "z", "\xC6\x8F" => "E",
  119 + "\xC6\xA0" => "O", "\xC6\xA1" => "o", "\xC6\xAF" => "U", "\xC6\xB0" => "u",
  120 + "\xC7\x8D" => "A", "\xC7\x8E" => "a", "\xC7\x8F" => "I",
  121 + "\xC7\x90" => "i", "\xC7\x91" => "O", "\xC7\x92" => "o", "\xC7\x93" => "U",
  122 + "\xC7\x94" => "u", "\xC7\x95" => "U", "\xC7\x96" => "u", "\xC7\x97" => "U",
  123 + "\xC7\x98" => "u", "\xC7\x99" => "U", "\xC7\x9A" => "u", "\xC7\x9B" => "U",
  124 + "\xC7\x9C" => "u",
  125 + "\xC7\xBA" => "A", "\xC7\xBB" => "a", "\xC7\xBC" => "AE","\xC7\xBD" => "ae",
  126 + "\xC7\xBE" => "O", "\xC7\xBF" => "o",
  127 + "\xC9\x99" => "e",
  128 + "\xC2\x82" => ",", # High code comma
  129 + "\xC2\x84" => ",,", # High code double comma
  130 + "\xC2\x85" => "...", # Triple dot
  131 + "\xC2\x88" => "^", # High caret
  132 + "\xC2\x91" => "\x27", # Forward single quote
  133 + "\xC2\x92" => "\x27", # Reverse single quote
  134 + "\xC2\x93" => "\x22", # Forward double quote
  135 + "\xC2\x94" => "\x22", # Reverse double quote
  136 + "\xC2\x96" => "-", # High hyphen
  137 + "\xC2\x97" => "--", # Double hyphen
  138 + "\xC2\xA6" => "|", # Split vertical bar
  139 + "\xC2\xAB" => "<<", # Double less than
  140 + "\xC2\xBB" => ">>", # Double greater than
  141 + "\xC2\xBC" => "1/4", # one quarter
  142 + "\xC2\xBD" => "1/2", # one half
  143 + "\xC2\xBE" => "3/4", # three quarters
  144 + "\xCA\xBF" => "\x27", # c-single quote
  145 + "\xCC\xA8" => "", # modifier - under curve
  146 + "\xCC\xB1" => "", # modifier - under line
  147 + /\W/ => ""
  148 + }
70 149  
71 150 end
72 151  
73 152 # Extension of the standard class String with useful functions.
74 153 class String
75 154 include Mirimiri
  155 +
  156 + def unaccent
  157 + # force_encoding is needed with ruby1.9
  158 + Transmap.inject(self.force_encoding("ASCII-8BIT")) { |str, (utf8, asc)| str.gsub(utf8, asc) }
  159 + end
76 160  
77 161 # Returns +true+ if +self+ belongs to Mirimiri::Stoplist, +false+ otherwise.
78 162 def is_stopword?