From 81ed24dd04a9524e59c08248af3647dae5830d56 Mon Sep 17 00:00:00 2001 From: Romain Deveaud Date: Thu, 25 Nov 2010 17:24:05 +0100 Subject: [PATCH] modules revamp. --- doc/classes/Corpus.html | 200 ++++++++++++++++ doc/classes/Corpus.src/M000001.html | 15 ++ doc/classes/Corpus.src/M000002.html | 15 ++ doc/classes/Indri.html | 109 +++++++++ doc/classes/Indri/IndriQuery.html | 219 ++++++++++++++++++ doc/classes/Indri/IndriQuery.src/M000020.html | 22 ++ doc/classes/Indri/IndriQuery.src/M000021.html | 22 ++ doc/classes/Indri/Parameters.html | 255 +++++++++++++++++++++ doc/classes/Indri/Parameters.src/M000018.html | 21 ++ doc/classes/Indri/Parameters.src/M000019.html | 29 +++ doc/classes/Query.html | 110 +++++++++ doc/classes/RIR.html | 130 +---------- doc/classes/RIR/Document.src/M000022.html | 2 +- doc/classes/RIR/Document.src/M000023.html | 2 +- doc/classes/RIR/Document.src/M000024.html | 2 +- doc/classes/RIR/Document.src/M000025.html | 2 +- doc/classes/RIR/Document.src/M000026.html | 2 +- doc/classes/RIR/Document.src/M000027.html | 2 +- doc/classes/RIR/WebDocument.src/M000028.html | 2 +- doc/classes/RIR/WebDocument.src/M000029.html | 2 +- doc/classes/RIR/WikipediaPage.src/M000030.html | 2 +- doc/classes/RIR/WikipediaPage.src/M000031.html | 2 +- doc/classes/RIR/WikipediaPage.src/M000032.html | 2 +- doc/classes/Regexp.html | 37 +-- doc/classes/Regexp.src/M000003.html | 15 ++ doc/classes/String.html | 110 ++++----- doc/classes/String.src/M000004.html | 8 +- doc/classes/String.src/M000005.html | 8 +- doc/classes/String.src/M000006.html | 8 +- doc/classes/String.src/M000007.html | 8 +- doc/classes/String.src/M000008.html | 9 +- doc/classes/String.src/M000009.html | 8 +- doc/classes/String.src/M000010.html | 9 +- doc/classes/String.src/M000011.html | 8 +- doc/classes/String.src/M000012.html | 8 +- doc/classes/String.src/M000013.html | 15 ++ doc/classes/String.src/M000014.html | 15 ++ doc/classes/TreeTagger.html | 123 ++++++++++ doc/classes/TreeTagger/Chunk.html | 195 ++++++++++++++++ doc/classes/TreeTagger/Chunk.src/M000017.html | 16 ++ doc/classes/TreeTagger/TaggerChunker.html | 216 +++++++++++++++++ .../TreeTagger/TaggerChunker.src/M000015.html | 39 ++++ .../TreeTagger/TaggerChunker.src/M000016.html | 15 ++ doc/classes/TreeTagger/TaggerChunkerEnglish.html | 114 +++++++++ doc/classes/TreeTagger/TaggerChunkerFrench.html | 114 +++++++++ doc/classes/TreeTagger/TaggerChunkerGerman.html | 114 +++++++++ doc/created.rid | 2 +- doc/files/lib/rir/corpus_rb.html | 29 +-- doc/files/lib/rir/document_rb.html | 29 +-- doc/files/lib/rir/query_rb.html | 29 +-- doc/files/lib/rir/regexp_rb.html | 29 +-- doc/files/lib/rir/string_rb.html | 26 +-- doc/files/lib/rir/ttagger_rb.html | 101 +------- doc/fr_class_index.html | 34 +-- doc/fr_file_index.html | 2 - doc/fr_method_index.html | 42 ++-- doc/index.html | 2 +- lib/rir/corpus.rb | 32 ++- lib/rir/document.rb | 3 + lib/rir/query.rb | 105 +++++---- lib/rir/regexp.rb | 2 + lib/rir/string.rb | 2 + lib/rir/ttagger.rb | 120 +++++----- 63 files changed, 2301 insertions(+), 669 deletions(-) create mode 100644 doc/classes/Corpus.html create mode 100644 doc/classes/Corpus.src/M000001.html create mode 100644 doc/classes/Corpus.src/M000002.html create mode 100644 doc/classes/Indri.html create mode 100644 doc/classes/Indri/IndriQuery.html create mode 100644 doc/classes/Indri/IndriQuery.src/M000020.html create mode 100644 doc/classes/Indri/IndriQuery.src/M000021.html create mode 100644 doc/classes/Indri/Parameters.html create mode 100644 doc/classes/Indri/Parameters.src/M000018.html create mode 100644 doc/classes/Indri/Parameters.src/M000019.html create mode 100644 doc/classes/Query.html create mode 100644 doc/classes/Regexp.src/M000003.html create mode 100644 doc/classes/String.src/M000013.html create mode 100644 doc/classes/String.src/M000014.html create mode 100644 doc/classes/TreeTagger.html create mode 100644 doc/classes/TreeTagger/Chunk.html create mode 100644 doc/classes/TreeTagger/Chunk.src/M000017.html create mode 100644 doc/classes/TreeTagger/TaggerChunker.html create mode 100644 doc/classes/TreeTagger/TaggerChunker.src/M000015.html create mode 100644 doc/classes/TreeTagger/TaggerChunker.src/M000016.html create mode 100644 doc/classes/TreeTagger/TaggerChunkerEnglish.html create mode 100644 doc/classes/TreeTagger/TaggerChunkerFrench.html create mode 100644 doc/classes/TreeTagger/TaggerChunkerGerman.html diff --git a/doc/classes/Corpus.html b/doc/classes/Corpus.html new file mode 100644 index 0000000..cacc4da --- /dev/null +++ b/doc/classes/Corpus.html @@ -0,0 +1,200 @@ + + + + Class: Corpus [RDoc Documentation] + + + + + + + + + +
+ + + + + + + + + + + + + + + + +
ClassCorpus
In: + + + + + lib/rir/corpus.rb + + + + +
+ +
Parent: + + Object + +
+
+ + +
+ +
+ +
+ + +
+

Methods

+ +
+ + files   + + new   + +
+
+ +
+ + + +
+ + + +
+

Attributes

+ +
+ + + + + + + + + + +
path [RW] 
+
+
+ + + + +
+ +

Public Class methods

+ + +
+ + + + +
+ +
+
+ + +

Public Instance methods

+ + +
+ + + + +
+ +

+Recursively outputs all files in self.path. WARNING ! This +function may take a lot of time if many files are in subdirectories. +

+
+  c = Corpus.new "my/path"
+  c.files                  # => ["README.txt", "lib/code.rb"]
+
+ +
+
+ + + +
+ + + + +
+ +
+

[Validate]

+
+ + + diff --git a/doc/classes/Corpus.src/M000001.html b/doc/classes/Corpus.src/M000001.html new file mode 100644 index 0000000..dc4ae58 --- /dev/null +++ b/doc/classes/Corpus.src/M000001.html @@ -0,0 +1,15 @@ + + + + new (Corpus) + + + + +
# File lib/rir/corpus.rb, line 25
+  def initialize(path)
+    @path = path.chomp "/"
+  end
+ + diff --git a/doc/classes/Corpus.src/M000002.html b/doc/classes/Corpus.src/M000002.html new file mode 100644 index 0000000..0a9287f --- /dev/null +++ b/doc/classes/Corpus.src/M000002.html @@ -0,0 +1,15 @@ + + + + files (Corpus) + + + + +
# File lib/rir/corpus.rb, line 35
+  def files
+    Dir["#{@path}/**/*.*"]
+  end
+ + diff --git a/doc/classes/Indri.html b/doc/classes/Indri.html new file mode 100644 index 0000000..c875dd7 --- /dev/null +++ b/doc/classes/Indri.html @@ -0,0 +1,109 @@ + + + + Module: Indri [RDoc Documentation] + + + + + + + + + +
+ + + + + + + + + + + +
ModuleIndri
In: + + + + + lib/rir/query.rb + + + + +
+ +
+
+ + +
+ +
+ +
+ + +
+ + + +
+ +
+

Classes and Modules

+ + Class Indri::IndriQuery
+Class Indri::Parameters
+ +
+ + + + + + + + + +
+ +
+

[Validate]

+
+ + + diff --git a/doc/classes/Indri/IndriQuery.html b/doc/classes/Indri/IndriQuery.html new file mode 100644 index 0000000..698b8a1 --- /dev/null +++ b/doc/classes/Indri/IndriQuery.html @@ -0,0 +1,219 @@ + + + + Class: Indri::IndriQuery [RDoc Documentation] + + + + + + + + + +
+ + + + + + + + + + + + + + + + +
ClassIndri::IndriQuery
In: + + + + + lib/rir/query.rb + + + + +
+ +
Parent: + + + + Query + + + +
+
+ + +
+ +
+ +
+ + +
+

Methods

+ +
+ + new   + + to_s   + +
+
+ +
+ + + +
+ + + +
+

Attributes

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
id [RW] 
params [RW] 
query [RW] 
rule [RW] 
+
+
+ + + + +
+ +

Public Class methods

+ + +
+ + + + +
+ +
+
+ + +

Public Instance methods

+ + +
+ + + + +
+ +
+
+ + + +
+ + + + +
+ +
+

[Validate]

+
+ + + diff --git a/doc/classes/Indri/IndriQuery.src/M000020.html b/doc/classes/Indri/IndriQuery.src/M000020.html new file mode 100644 index 0000000..a66ba37 --- /dev/null +++ b/doc/classes/Indri/IndriQuery.src/M000020.html @@ -0,0 +1,22 @@ + + + + new (Indri::IndriQuery) + + + + +
# File lib/rir/query.rb, line 62
+    def initialize(id,query,params)
+      @params = params
+      # Here we set the default retrieval model as Language Modeling
+      # with a Dirichlet smoothing at 2500.
+      # TODO: maybe a Rule class...
+      @params.rule  = 'method:dirichlet,mu:2500' if @params.rule.nil?
+
+      @id     = id
+      @query  = query
+    end
+ + diff --git a/doc/classes/Indri/IndriQuery.src/M000021.html b/doc/classes/Indri/IndriQuery.src/M000021.html new file mode 100644 index 0000000..728320c --- /dev/null +++ b/doc/classes/Indri/IndriQuery.src/M000021.html @@ -0,0 +1,22 @@ + + + + to_s (Indri::IndriQuery) + + + + +
# File lib/rir/query.rb, line 73
+    def to_s
+      h = @params.to_s
+      h += "<query>\n"
+      h += "<number>#{@id}</number>\n"
+      h += "<text>#{@query}</text>\n"
+      h += "</query>\n"
+      h += "</parameters>"
+
+      h
+    end
+ + diff --git a/doc/classes/Indri/Parameters.html b/doc/classes/Indri/Parameters.html new file mode 100644 index 0000000..e70ca27 --- /dev/null +++ b/doc/classes/Indri/Parameters.html @@ -0,0 +1,255 @@ + + + + Class: Indri::Parameters [RDoc Documentation] + + + + + + + + + +
+ + + + + + + + + + + + + + + + +
ClassIndri::Parameters
In: + + + + + lib/rir/query.rb + + + + +
+ +
Parent: + + Object + +
+
+ + +
+ +
+ +
+ + +
+

Methods

+ +
+ + new   + + to_s   + +
+
+ +
+ + + +
+ + + +
+

Attributes

+ +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
baseline [RW] 
count [RW] 
index_path [RW] 
memory [RW] 
offset [RW] 
print_docs [RW] 
print_query [RW] 
rule [RW] 
run_id [RW] 
+
+
+ + + + +
+ +

Public Class methods

+ + + + + +

Public Instance methods

+ + +
+ + + + +
+ +
+
+ + + +
+ + + + +
+ +
+

[Validate]

+
+ + + diff --git a/doc/classes/Indri/Parameters.src/M000018.html b/doc/classes/Indri/Parameters.src/M000018.html new file mode 100644 index 0000000..87337f4 --- /dev/null +++ b/doc/classes/Indri/Parameters.src/M000018.html @@ -0,0 +1,21 @@ + + + + new (Indri::Parameters) + + + + +
# File lib/rir/query.rb, line 30
+    def initialize(corpus,mem="1g",count="1000",offset="1",run_id="default",print_query=false,print_docs=false)
+      @index_path  = corpus
+      @memory      = mem
+      @count       = count
+      @offset      = offset
+      @run_id      = run_id
+      @print_query = print_query ? "true" : "false"
+      @print_docs  = print_docs  ? "true" : "false"
+    end
+ + diff --git a/doc/classes/Indri/Parameters.src/M000019.html b/doc/classes/Indri/Parameters.src/M000019.html new file mode 100644 index 0000000..da5c34c --- /dev/null +++ b/doc/classes/Indri/Parameters.src/M000019.html @@ -0,0 +1,29 @@ + + + + to_s (Indri::Parameters) + + + + +
# File lib/rir/query.rb, line 40
+    def to_s
+      h = "<parameters>\n"
+      h += "<memory>#{@memory}</memory>\n"
+      h += "<index>#{@index_path}</index>\n"
+      h += "<count>#{@count}</count>\n"
+      unless @baseline.nil?
+        h += "<baseline>#{@baseline}</baseline>\n" 
+      else
+        h += "<rule>#{@rule}</rule>\n"
+      end
+      h += "<queryOffset>#{@offset}</queryOffset>\n"
+      h += "<runID>#{@run_id}</runID>\n"
+      h += "<printQuery>#{@print_query}</printQuery>\n"
+      h += "<printDocuments>#{@print_docs}</printDocuments>\n"
+
+      h
+    end
+ + diff --git a/doc/classes/Query.html b/doc/classes/Query.html new file mode 100644 index 0000000..c29e471 --- /dev/null +++ b/doc/classes/Query.html @@ -0,0 +1,110 @@ + + + + Class: Query [RDoc Documentation] + + + + + + + + + +
+ + + + + + + + + + + + + + + + +
ClassQuery
In: + + + + + lib/rir/query.rb + + + + +
+ +
Parent: + + Object + +
+
+ + +
+ +
+ +
+ + +
+ + + +
+ + + + + + + + + +
+ +
+

[Validate]

+
+ + + diff --git a/doc/classes/RIR.html b/doc/classes/RIR.html index 0149a10..84230a7 100644 --- a/doc/classes/RIR.html +++ b/doc/classes/RIR.html @@ -53,36 +53,6 @@ - - - lib/rir/ttagger.rb - - - - -
- - - - - lib/rir/corpus.rb - - - - -
- - - - - lib/rir/query.rb - - - - -
- - lib/rir/string.rb @@ -115,99 +85,7 @@
-

-This file is a part of an Information Retrieval oriented Ruby library -

-

-Copyright (C) 2010-2011 Romain Deveaud -

-

-This program is free software: you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the Free -Software Foundation, either version 3 of the License, or (at your option) -any later version. -

-

-This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -more details. -

-

-You should have received a copy of the GNU General Public License along -with this program. If not, see <www.gnu.org/licenses/>. -

-

-This file is a part of an Information Retrieval oriented Ruby library -

-

-Copyright (C) 2010-2011 Romain Deveaud -

-

-This program is free software: you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the Free -Software Foundation, either version 3 of the License, or (at your option) -any later version. -

-

-This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -more details. -

-

-You should have received a copy of the GNU General Public License along -with this program. If not, see <www.gnu.org/licenses/>. -

-

-This file is a part of an Information Retrieval oriented Ruby library -

-

-Copyright (C) 2010-2011 Romain Deveaud -

-

-This program is free software: you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the Free -Software Foundation, either version 3 of the License, or (at your option) -any later version. -

-

-This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -more details. -

-

-You should have received a copy of the GNU General Public License along -with this program. If not, see <www.gnu.org/licenses/>. -

-

-This file is a part of an Information Retrieval oriented Ruby library -

-

-Copyright (C) 2010-2011 Romain Deveaud -

-

-This program is free software: you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the Free -Software Foundation, either version 3 of the License, or (at your option) -any later version. -

-

-This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -more details. -

-

-You should have received a copy of the GNU General Public License along -with this program. If not, see <www.gnu.org/licenses/>. -

-

+


General module for many purposes related to Information Retrieval.

@@ -225,11 +103,7 @@ General module for many purposes related to Information Retrieval.

Classes and Modules

- Module RIR::Indri
-Module RIR::TreeTagger
-Class RIR::Corpus
-Class RIR::Document
-Class RIR::Query
+ Class RIR::Document
Class RIR::WebDocument
Class RIR::WikipediaPage
diff --git a/doc/classes/RIR/Document.src/M000022.html b/doc/classes/RIR/Document.src/M000022.html index 72c51f5..d476fd8 100644 --- a/doc/classes/RIR/Document.src/M000022.html +++ b/doc/classes/RIR/Document.src/M000022.html @@ -7,7 +7,7 @@ -
# File lib/rir/document.rb, line 31
+  
# File lib/rir/document.rb, line 34
     def format_words
       wo = []
 
diff --git a/doc/classes/RIR/Document.src/M000023.html b/doc/classes/RIR/Document.src/M000023.html
index 6257629..3416fef 100644
--- a/doc/classes/RIR/Document.src/M000023.html
+++ b/doc/classes/RIR/Document.src/M000023.html
@@ -7,7 +7,7 @@
   
 
 
-  
# File lib/rir/document.rb, line 46
+  
# File lib/rir/document.rb, line 49
     def ngrams(n)
       window       = []
       ngrams_array = []
diff --git a/doc/classes/RIR/Document.src/M000024.html b/doc/classes/RIR/Document.src/M000024.html
index e8ddeec..bb859fd 100644
--- a/doc/classes/RIR/Document.src/M000024.html
+++ b/doc/classes/RIR/Document.src/M000024.html
@@ -7,7 +7,7 @@
   
 
 
-  
# File lib/rir/document.rb, line 64
+  
# File lib/rir/document.rb, line 67
     def count_words
       counts = Hash.new { |h,k| h[k] = 0 }
       @words.each { |w| counts[w] += 1 }
diff --git a/doc/classes/RIR/Document.src/M000025.html b/doc/classes/RIR/Document.src/M000025.html
index 50f6db7..9ccf905 100644
--- a/doc/classes/RIR/Document.src/M000025.html
+++ b/doc/classes/RIR/Document.src/M000025.html
@@ -7,7 +7,7 @@
   
 
 
-  
# File lib/rir/document.rb, line 78
+  
# File lib/rir/document.rb, line 81
     def entropy(s)
       en = 0.0
       counts = self.count_words
diff --git a/doc/classes/RIR/Document.src/M000026.html b/doc/classes/RIR/Document.src/M000026.html
index eb4436b..0b57bd6 100644
--- a/doc/classes/RIR/Document.src/M000026.html
+++ b/doc/classes/RIR/Document.src/M000026.html
@@ -7,7 +7,7 @@
   
 
 
-  
# File lib/rir/document.rb, line 94
+  
# File lib/rir/document.rb, line 97
     def tf(s)
       self.count_words[s].to_f/@words.size.to_f
     end
diff --git a/doc/classes/RIR/Document.src/M000027.html b/doc/classes/RIR/Document.src/M000027.html index 1ef96d5..6b8e2c2 100644 --- a/doc/classes/RIR/Document.src/M000027.html +++ b/doc/classes/RIR/Document.src/M000027.html @@ -7,7 +7,7 @@ -
# File lib/rir/document.rb, line 99
+  
# File lib/rir/document.rb, line 102
     def initialize(content)
       @doc_content = content
       @words = format_words
diff --git a/doc/classes/RIR/WebDocument.src/M000028.html b/doc/classes/RIR/WebDocument.src/M000028.html
index e3931e7..cf2b1a5 100644
--- a/doc/classes/RIR/WebDocument.src/M000028.html
+++ b/doc/classes/RIR/WebDocument.src/M000028.html
@@ -7,7 +7,7 @@
   
 
 
-  
# File lib/rir/document.rb, line 112
+  
# File lib/rir/document.rb, line 115
     def self.get_content(url)
       require 'net/http'
       Net::HTTP.get(URI.parse(url))
diff --git a/doc/classes/RIR/WebDocument.src/M000029.html b/doc/classes/RIR/WebDocument.src/M000029.html
index dd6da25..6750287 100644
--- a/doc/classes/RIR/WebDocument.src/M000029.html
+++ b/doc/classes/RIR/WebDocument.src/M000029.html
@@ -7,7 +7,7 @@
   
 
 
-  
# File lib/rir/document.rb, line 119
+  
# File lib/rir/document.rb, line 122
     def initialize(url)
       @url = url
       super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags
diff --git a/doc/classes/RIR/WikipediaPage.src/M000030.html b/doc/classes/RIR/WikipediaPage.src/M000030.html
index eb3518e..3318c27 100644
--- a/doc/classes/RIR/WikipediaPage.src/M000030.html
+++ b/doc/classes/RIR/WikipediaPage.src/M000030.html
@@ -7,7 +7,7 @@
   
 
 
-  
# File lib/rir/document.rb, line 132
+  
# File lib/rir/document.rb, line 135
     def self.search_wikipedia_titles(name)
       raise ArgumentError, "Bad encoding", name unless name.isutf8
 
diff --git a/doc/classes/RIR/WikipediaPage.src/M000031.html b/doc/classes/RIR/WikipediaPage.src/M000031.html
index 0b6b98b..01ebe85 100644
--- a/doc/classes/RIR/WikipediaPage.src/M000031.html
+++ b/doc/classes/RIR/WikipediaPage.src/M000031.html
@@ -7,7 +7,7 @@
   
 
 
-  
# File lib/rir/document.rb, line 140
+  
# File lib/rir/document.rb, line 143
     def self.get_url(name)
       raise ArgumentError, "Bad encoding", name unless name.isutf8
 
diff --git a/doc/classes/RIR/WikipediaPage.src/M000032.html b/doc/classes/RIR/WikipediaPage.src/M000032.html
index d93d8db..41f155c 100644
--- a/doc/classes/RIR/WikipediaPage.src/M000032.html
+++ b/doc/classes/RIR/WikipediaPage.src/M000032.html
@@ -7,7 +7,7 @@
   
 
 
-  
# File lib/rir/document.rb, line 148
+  
# File lib/rir/document.rb, line 151
     def self.search_homepage(name)
       title = WikipediaPage.search_wikipedia_titles name
 
diff --git a/doc/classes/Regexp.html b/doc/classes/Regexp.html
index d23a050..03160e6 100644
--- a/doc/classes/Regexp.html
+++ b/doc/classes/Regexp.html
@@ -83,33 +83,6 @@
 
   
-
-

-This file is a part of an Information Retrieval oriented Ruby library -

-

-Copyright (C) 2010-2011 Romain Deveaud -

-

-This program is free software: you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the Free -Software Foundation, either version 3 of the License, or (at your option) -any later version. -

-

-This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -more details. -

-

-You should have received a copy of the GNU General Public License along -with this program. If not, see <www.gnu.org/licenses/>. -

- -
-
@@ -118,7 +91,7 @@ href="http://www.gnu.org/licenses/">www.gnu.org/licenses/>.
- negated   + negated  
@@ -139,13 +112,13 @@ href="http://www.gnu.org/licenses/">www.gnu.org/licenses/>.

Public Instance methods

-
- +
+ @@ -150,13 +150,13 @@ useful function.

Public Instance methods

-
- +
+ -
- +
+ -
- +
+ -
- +
+ -
- +
+ -
- +
+ -
- +
+ -
- +
+ -
- +
+ -
- +
+ -
- +
+
- + strip_xml_tags!() diff --git a/doc/classes/String.src/M000004.html b/doc/classes/String.src/M000004.html index 01c1839..70c504f 100644 --- a/doc/classes/String.src/M000004.html +++ b/doc/classes/String.src/M000004.html @@ -2,14 +2,14 @@ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> - strip_xml_tags! (String) + is_stopword? (String) -
# File lib/rir/string.rb, line 91
-  def strip_xml_tags!
-    replace strip_with_pattern /<\/?[^>]*>/
+  
# File lib/rir/string.rb, line 78
+  def is_stopword?
+    Stoplist.include?(self.downcase)
   end
diff --git a/doc/classes/String.src/M000005.html b/doc/classes/String.src/M000005.html index 2d020b7..9073156 100644 --- a/doc/classes/String.src/M000005.html +++ b/doc/classes/String.src/M000005.html @@ -2,14 +2,14 @@ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> - strip_xml_tags (String) + remove_special_characters (String) -
# File lib/rir/string.rb, line 100
-  def strip_xml_tags
-    dup.strip_xml_tags!
+  
# File lib/rir/string.rb, line 84
+  def remove_special_characters
+    self.split.collect { |w| w.gsub(/\W/,' ').split.collect { |w| w.gsub(/\W/,' ').strip.sub(/\A.\z/, '')}.join(' ').strip.sub(/\A.\z/, '')}.join(' ')
   end
diff --git a/doc/classes/String.src/M000006.html b/doc/classes/String.src/M000006.html index 1f77395..0d97963 100644 --- a/doc/classes/String.src/M000006.html +++ b/doc/classes/String.src/M000006.html @@ -2,14 +2,14 @@ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> - strip_javascripts! (String) + strip_xml_tags! (String) -
# File lib/rir/string.rb, line 114
-  def strip_javascripts!
-    replace strip_with_pattern /<script type="text\/javascript">(.+?)<\/script>/m 
+  
# File lib/rir/string.rb, line 93
+  def strip_xml_tags!
+    replace strip_with_pattern /<\/?[^>]*>/
   end
diff --git a/doc/classes/String.src/M000007.html b/doc/classes/String.src/M000007.html index 8a73177..00efa8f 100644 --- a/doc/classes/String.src/M000007.html +++ b/doc/classes/String.src/M000007.html @@ -2,14 +2,14 @@ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> - strip_javascripts (String) + strip_xml_tags (String) -
# File lib/rir/string.rb, line 127
-  def strip_javascripts
-    dup.strip_javascripts!
+  
# File lib/rir/string.rb, line 102
+  def strip_xml_tags
+    dup.strip_xml_tags!
   end
diff --git a/doc/classes/String.src/M000008.html b/doc/classes/String.src/M000008.html index 49c5a94..93970bf 100644 --- a/doc/classes/String.src/M000008.html +++ b/doc/classes/String.src/M000008.html @@ -2,15 +2,14 @@ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> - strip_stylesheets! (String) + strip_javascripts! (String) -
# File lib/rir/string.rb, line 131
-  def strip_stylesheets!
-  # TODO: rewamp. dunno what is it.
-    replace strip_with_pattern /<style type="text\/css">(.+?)<\/style>/m 
+  
# File lib/rir/string.rb, line 116
+  def strip_javascripts!
+    replace strip_with_pattern /<script type="text\/javascript">(.+?)<\/script>/m 
   end
diff --git a/doc/classes/String.src/M000009.html b/doc/classes/String.src/M000009.html index a10b5bd..b143c5a 100644 --- a/doc/classes/String.src/M000009.html +++ b/doc/classes/String.src/M000009.html @@ -2,14 +2,14 @@ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> - strip_stylesheets (String) + strip_javascripts (String) -
# File lib/rir/string.rb, line 136
-  def strip_stylesheets
-    dup.strip_stylesheets!
+  
# File lib/rir/string.rb, line 129
+  def strip_javascripts
+    dup.strip_javascripts!
   end
diff --git a/doc/classes/String.src/M000010.html b/doc/classes/String.src/M000010.html index 37f6f1f..f26b6c6 100644 --- a/doc/classes/String.src/M000010.html +++ b/doc/classes/String.src/M000010.html @@ -2,14 +2,15 @@ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> - strip_punctuation! (String) + strip_stylesheets! (String) -
# File lib/rir/string.rb, line 145
-  def strip_punctuation!
-    replace strip_with_pattern /[^a-zA-Z0-9\-\s]/
+  
# File lib/rir/string.rb, line 133
+  def strip_stylesheets!
+  # TODO: rewamp. dunno what is it.
+    replace strip_with_pattern /<style type="text\/css">(.+?)<\/style>/m 
   end
diff --git a/doc/classes/String.src/M000011.html b/doc/classes/String.src/M000011.html index 36b9164..00ac846 100644 --- a/doc/classes/String.src/M000011.html +++ b/doc/classes/String.src/M000011.html @@ -2,14 +2,14 @@ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> - strip_punctuation (String) + strip_stylesheets (String) -
# File lib/rir/string.rb, line 153
-  def strip_punctuation
-    dup.strip_punctuation!
+  
# File lib/rir/string.rb, line 138
+  def strip_stylesheets
+    dup.strip_stylesheets!
   end
diff --git a/doc/classes/String.src/M000012.html b/doc/classes/String.src/M000012.html index c920941..a6e3495 100644 --- a/doc/classes/String.src/M000012.html +++ b/doc/classes/String.src/M000012.html @@ -2,14 +2,14 @@ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> - extract_xmltags_values (String) + strip_punctuation! (String) -
# File lib/rir/string.rb, line 161
-  def extract_xmltags_values(tag_name)
-    self.scan(/<#{tag_name}.*?>(.+?)<\/#{tag_name}>/).flatten
+  
# File lib/rir/string.rb, line 147
+  def strip_punctuation!
+    replace strip_with_pattern /[^a-zA-Z0-9\-\s]/
   end
diff --git a/doc/classes/String.src/M000013.html b/doc/classes/String.src/M000013.html new file mode 100644 index 0000000..aa192ed --- /dev/null +++ b/doc/classes/String.src/M000013.html @@ -0,0 +1,15 @@ + + + + strip_punctuation (String) + + + + +
# File lib/rir/string.rb, line 155
+  def strip_punctuation
+    dup.strip_punctuation!
+  end
+ + diff --git a/doc/classes/String.src/M000014.html b/doc/classes/String.src/M000014.html new file mode 100644 index 0000000..d6e6648 --- /dev/null +++ b/doc/classes/String.src/M000014.html @@ -0,0 +1,15 @@ + + + + extract_xmltags_values (String) + + + + +
# File lib/rir/string.rb, line 163
+  def extract_xmltags_values(tag_name)
+    self.scan(/<#{tag_name}.*?>(.+?)<\/#{tag_name}>/).flatten
+  end
+ + diff --git a/doc/classes/TreeTagger.html b/doc/classes/TreeTagger.html new file mode 100644 index 0000000..7ca1358 --- /dev/null +++ b/doc/classes/TreeTagger.html @@ -0,0 +1,123 @@ + + + + Module: TreeTagger [RDoc Documentation] + + + + + + + + + +
+ + + + + + + + + + + +
ModuleTreeTagger
In: + + + + + lib/rir/ttagger.rb + + + + +
+ +
+
+ + +
+ +
+ +
+

+TreeTagger-related stuff module. +

+

+See www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/DecisionTreeTagger.html +

+ +
+ +
+ + +
+ + + +
+ + + + + diff --git a/doc/classes/TreeTagger/Chunk.html b/doc/classes/TreeTagger/Chunk.html new file mode 100644 index 0000000..4aec6b0 --- /dev/null +++ b/doc/classes/TreeTagger/Chunk.html @@ -0,0 +1,195 @@ + + + + Class: TreeTagger::Chunk [RDoc Documentation] + + + + + + + + + +
+ + + + + + + + + + + + + + + + +
ClassTreeTagger::Chunk
In: + + + + + lib/rir/ttagger.rb + + + + +
+ +
Parent: + + Object + +
+
+ + +
+ +
+ +
+

+Represents a Chunk extracted when parsing a TaggerChunker file. +

+ +
+ +
+ + +
+

Methods

+ +
+ + new   + +
+
+ +
+ + + +
+ + + +
+

Attributes

+ +
+ + + + + + + + + + + + + + + + + + +
tag [R] 
words [R] 
+
+
+ + + + +
+ +

Public Class methods

+ + +
+ + + + +
+ +

+Creates a Chunk. +

+ + +
+
+ + + +
+ + + + +
+ + + + + diff --git a/doc/classes/TreeTagger/Chunk.src/M000017.html b/doc/classes/TreeTagger/Chunk.src/M000017.html new file mode 100644 index 0000000..739251a --- /dev/null +++ b/doc/classes/TreeTagger/Chunk.src/M000017.html @@ -0,0 +1,16 @@ + + + + new (TreeTagger::Chunk) + + + + +
# File lib/rir/ttagger.rb, line 89
+    def initialize str,tag
+      @words = str.split
+      @tag   = tag[1..-2]
+    end
+ + diff --git a/doc/classes/TreeTagger/TaggerChunker.html b/doc/classes/TreeTagger/TaggerChunker.html new file mode 100644 index 0000000..2e7f693 --- /dev/null +++ b/doc/classes/TreeTagger/TaggerChunker.html @@ -0,0 +1,216 @@ + + + + Class: TreeTagger::TaggerChunker [RDoc Documentation] + + + + + + + + + +
+ + + + + + + + + + + + + + + + +
ClassTreeTagger::TaggerChunker
In: + + + + + lib/rir/ttagger.rb + + + + +
+ +
Parent: + + Object + +
+
+ + +
+ +
+ +
+

+This class handles generic parsing of tagger-chunker outputs. +

+ +
+ +
+ + +
+

Methods

+ +
+ + new   + + parse   + +
+
+ +
+ + + +
+ + + +
+

Attributes

+ +
+ + + + + + + + + + + + + + + + + + +
chunks [R] 
file [R] 
+
+
+ + + + +
+ +

Public Class methods

+ + +
+ + + + +
+ +

+Initializes parsing. chunk_file is the output of +tagger-chunker- and must be a valid path to the file. +

+
+  TaggerChunker.new("ttout/2010020") #=> #<RIR::TreeTagger::TaggerChunker:0x92fd088 @chunks=[#<RIR::TreeTagger::Chunk:0x8ec5a10 @words=["robert", "schumann"], @tag="NC">, ...] ...>
+
+ +
+
+ + +
+ + + + +
+ +

+Parses a tagger-chunker output and returns an Array of Chunk. +

+ +
+
+ + + +
+ + + + +
+ + + + + diff --git a/doc/classes/TreeTagger/TaggerChunker.src/M000015.html b/doc/classes/TreeTagger/TaggerChunker.src/M000015.html new file mode 100644 index 0000000..a3a3546 --- /dev/null +++ b/doc/classes/TreeTagger/TaggerChunker.src/M000015.html @@ -0,0 +1,39 @@ + + + + parse (TreeTagger::TaggerChunker) + + + + +
# File lib/rir/ttagger.rb, line 34
+    def self.parse chunk_lines
+      open = false
+      tag  = nil
+
+      chunks = []
+      words  = []
+
+      chunk_lines.each do |l|
+        l.chomp!
+        if l =~ /^<\w+>$/
+          open = true
+          tag  = l
+        elsif l =~ /^<\/\w+>$/
+          if !words.empty? && open && l == tag.sub(/</, '</')
+            open = false
+            chunks.push Chunk.new(words.join(" "), tag) 
+            words.clear
+          else
+            next
+          end
+        else
+          words.push(l.split.first)
+        end
+      end
+
+      chunks
+    end
+ + diff --git a/doc/classes/TreeTagger/TaggerChunker.src/M000016.html b/doc/classes/TreeTagger/TaggerChunker.src/M000016.html new file mode 100644 index 0000000..6652c5c --- /dev/null +++ b/doc/classes/TreeTagger/TaggerChunker.src/M000016.html @@ -0,0 +1,15 @@ + + + + new (TreeTagger::TaggerChunker) + + + + +
# File lib/rir/ttagger.rb, line 66
+    def initialize chunk_file
+      @chunks = TaggerChunker.parse File.open(chunk_file).readlines
+    end
+ + diff --git a/doc/classes/TreeTagger/TaggerChunkerEnglish.html b/doc/classes/TreeTagger/TaggerChunkerEnglish.html new file mode 100644 index 0000000..45936ae --- /dev/null +++ b/doc/classes/TreeTagger/TaggerChunkerEnglish.html @@ -0,0 +1,114 @@ + + + + Class: TreeTagger::TaggerChunkerEnglish [RDoc Documentation] + + + + + + + + + +
+ + + + + + + + + + + + + + + + +
ClassTreeTagger::TaggerChunkerEnglish
In: + + + + + lib/rir/ttagger.rb + + + + +
+ +
Parent: + + + + TreeTagger::TaggerChunker + + + +
+
+ + +
+ +
+ +
+ + +
+ + + +
+ + + + + + + + + +
+ + + + + diff --git a/doc/classes/TreeTagger/TaggerChunkerFrench.html b/doc/classes/TreeTagger/TaggerChunkerFrench.html new file mode 100644 index 0000000..8309193 --- /dev/null +++ b/doc/classes/TreeTagger/TaggerChunkerFrench.html @@ -0,0 +1,114 @@ + + + + Class: TreeTagger::TaggerChunkerFrench [RDoc Documentation] + + + + + + + + + +
+ + + + + + + + + + + + + + + + +
ClassTreeTagger::TaggerChunkerFrench
In: + + + + + lib/rir/ttagger.rb + + + + +
+ +
Parent: + + + + TreeTagger::TaggerChunker + + + +
+
+ + +
+ +
+ +
+ + +
+ + + +
+ + + + + + + + + +
+ + + + + diff --git a/doc/classes/TreeTagger/TaggerChunkerGerman.html b/doc/classes/TreeTagger/TaggerChunkerGerman.html new file mode 100644 index 0000000..e9a998a --- /dev/null +++ b/doc/classes/TreeTagger/TaggerChunkerGerman.html @@ -0,0 +1,114 @@ + + + + Class: TreeTagger::TaggerChunkerGerman [RDoc Documentation] + + + + + + + + + +
+ + + + + + + + + + + + + + + + +
ClassTreeTagger::TaggerChunkerGerman
In: + + + + + lib/rir/ttagger.rb + + + + +
+ +
Parent: + + + + TreeTagger::TaggerChunker + + + +
+
+ + +
+ +
+ +
+ + +
+ + + +
+ + + + + + + + + +
+ + + + + diff --git a/doc/created.rid b/doc/created.rid index 6e8bb1d..896d22c 100644 --- a/doc/created.rid +++ b/doc/created.rid @@ -1 +1 @@ -Thu, 25 Nov 2010 17:10:04 +0100 +Thu, 25 Nov 2010 17:21:51 +0100 diff --git a/doc/files/lib/rir/corpus_rb.html b/doc/files/lib/rir/corpus_rb.html index 95ef563..a211e38 100644 --- a/doc/files/lib/rir/corpus_rb.html +++ b/doc/files/lib/rir/corpus_rb.html @@ -53,7 +53,7 @@ Last Update: - 2010-11-23 18:20:24 +0100 + 2010-11-25 17:20:52 +0100
@@ -63,33 +63,6 @@
-
-

-This file is a part of an Information Retrieval oriented Ruby library -

-

-Copyright (C) 2010-2011 Romain Deveaud -

-

-This program is free software: you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the Free -Software Foundation, either version 3 of the License, or (at your option) -any later version. -

-

-This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -more details. -

-

-You should have received a copy of the GNU General Public License along -with this program. If not, see <www.gnu.org/licenses/>. -

- -
-
diff --git a/doc/files/lib/rir/document_rb.html b/doc/files/lib/rir/document_rb.html index d5a6ac5..3dd8c6c 100644 --- a/doc/files/lib/rir/document_rb.html +++ b/doc/files/lib/rir/document_rb.html @@ -53,7 +53,7 @@ Last Update: - 2010-11-25 16:04:20 +0100 + 2010-11-25 17:20:25 +0100
@@ -63,33 +63,6 @@
-
-

-This file is a part of an Information Retrieval oriented Ruby library -

-

-Copyright (C) 2010-2011 Romain Deveaud -

-

-This program is free software: you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the Free -Software Foundation, either version 3 of the License, or (at your option) -any later version. -

-

-This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -more details. -

-

-You should have received a copy of the GNU General Public License along -with this program. If not, see <www.gnu.org/licenses/>. -

- -
-

Required files

diff --git a/doc/files/lib/rir/query_rb.html b/doc/files/lib/rir/query_rb.html index a2db500..7c34092 100644 --- a/doc/files/lib/rir/query_rb.html +++ b/doc/files/lib/rir/query_rb.html @@ -53,7 +53,7 @@ Last Update: - 2010-11-25 13:25:18 +0100 + 2010-11-25 17:21:14 +0100
@@ -63,33 +63,6 @@
-
-

-This file is a part of an Information Retrieval oriented Ruby library -

-

-Copyright (C) 2010-2011 Romain Deveaud -

-

-This program is free software: you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the Free -Software Foundation, either version 3 of the License, or (at your option) -any later version. -

-

-This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -more details. -

-

-You should have received a copy of the GNU General Public License along -with this program. If not, see <www.gnu.org/licenses/>. -

- -
-
diff --git a/doc/files/lib/rir/regexp_rb.html b/doc/files/lib/rir/regexp_rb.html index b1693a7..a8a4bf1 100644 --- a/doc/files/lib/rir/regexp_rb.html +++ b/doc/files/lib/rir/regexp_rb.html @@ -53,7 +53,7 @@ Last Update: - 2010-11-19 11:27:06 +0100 + 2010-11-25 17:19:39 +0100
@@ -63,33 +63,6 @@
-
-

-This file is a part of an Information Retrieval oriented Ruby library -

-

-Copyright (C) 2010-2011 Romain Deveaud -

-

-This program is free software: you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the Free -Software Foundation, either version 3 of the License, or (at your option) -any later version. -

-

-This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -more details. -

-

-You should have received a copy of the GNU General Public License along -with this program. If not, see <www.gnu.org/licenses/>. -

- -
-
diff --git a/doc/files/lib/rir/string_rb.html b/doc/files/lib/rir/string_rb.html index 73f0e29..1d78744 100644 --- a/doc/files/lib/rir/string_rb.html +++ b/doc/files/lib/rir/string_rb.html @@ -53,7 +53,7 @@ Last Update: - 2010-11-23 18:20:41 +0100 + 2010-11-25 17:20:14 +0100
@@ -64,28 +64,8 @@
-

-This file is a part of an Information Retrieval oriented Ruby library -

-

-Copyright (C) 2010-2011 Romain Deveaud -

-

-This program is free software: you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the Free -Software Foundation, either version 3 of the License, or (at your option) -any later version. -

-

-This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -more details. -

-

-You should have received a copy of the GNU General Public License along -with this program. If not, see <www.gnu.org/licenses/>. +


+General module for many purposes related to Information Retrieval.

diff --git a/doc/files/lib/rir/ttagger_rb.html b/doc/files/lib/rir/ttagger_rb.html index 7358f13..8ba5cc5 100644 --- a/doc/files/lib/rir/ttagger_rb.html +++ b/doc/files/lib/rir/ttagger_rb.html @@ -53,7 +53,7 @@ Last Update: - 2010-11-25 17:01:46 +0100 + 2010-11-25 17:21:44 +0100
@@ -63,105 +63,6 @@
-
-

-This file is a part of an Information Retrieval oriented Ruby library -

-

-Copyright (C) 2010-2011 Romain Deveaud -

-

-This program is free software: you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the Free -Software Foundation, either version 3 of the License, or (at your option) -any later version. -

-

-This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -more details. -

-

-You should have received a copy of the GNU General Public License along -with this program. If not, see <www.gnu.org/licenses/>. -

-

-This file is a part of an Information Retrieval oriented Ruby library -

-

-Copyright (C) 2010-2011 Romain Deveaud -

-

-This program is free software: you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the Free -Software Foundation, either version 3 of the License, or (at your option) -any later version. -

-

-This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -more details. -

-

-You should have received a copy of the GNU General Public License along -with this program. If not, see <www.gnu.org/licenses/>. -

-

-This file is a part of an Information Retrieval oriented Ruby library -

-

-Copyright (C) 2010-2011 Romain Deveaud -

-

-This program is free software: you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the Free -Software Foundation, either version 3 of the License, or (at your option) -any later version. -

-

-This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -more details. -

-

-You should have received a copy of the GNU General Public License along -with this program. If not, see <www.gnu.org/licenses/>. -

-

-This file is a part of an Information Retrieval oriented Ruby library -

-

-Copyright (C) 2010-2011 Romain Deveaud -

-

-This program is free software: you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the Free -Software Foundation, either version 3 of the License, or (at your option) -any later version. -

-

-This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -more details. -

-

-You should have received a copy of the GNU General Public License along -with this program. If not, see <www.gnu.org/licenses/>. -

-

-General module for many purposes related to Information Retrieval. -

- -
-
diff --git a/doc/fr_class_index.html b/doc/fr_class_index.html index 0773723..43770f4 100644 --- a/doc/fr_class_index.html +++ b/doc/fr_class_index.html @@ -17,39 +17,39 @@

Classes

diff --git a/doc/fr_file_index.html b/doc/fr_file_index.html index 87c39ae..96d6676 100644 --- a/doc/fr_file_index.html +++ b/doc/fr_file_index.html @@ -17,8 +17,6 @@

Files

- lib/rir.rb
- lib/rir/corpus.rb
lib/rir/document.rb
diff --git a/doc/fr_method_index.html b/doc/fr_method_index.html index 4b02dfc..d9ad3c7 100644 --- a/doc/fr_method_index.html +++ b/doc/fr_method_index.html @@ -21,9 +21,9 @@ entropy (RIR::Document)
- extract_xmltags_values (String)
+ extract_xmltags_values (String)
- files (RIR::Corpus)
+ files (Corpus)
format_words (RIR::Document)
@@ -31,55 +31,55 @@ get_url (RIR::WikipediaPage)
- is_stopword? (String)
+ is_stopword? (String)
- negated (Regexp)
+ negated (Regexp)
new (RIR::Document)
new (RIR::WebDocument)
- new (RIR::Indri::IndriQuery)
+ new (Indri::IndriQuery)
- new (RIR::Corpus)
+ new (Indri::Parameters)
- new (RIR::TreeTagger::TaggerChunker)
+ new (Corpus)
- new (RIR::Indri::Parameters)
+ new (TreeTagger::TaggerChunker)
- new (RIR::TreeTagger::Chunk)
+ new (TreeTagger::Chunk)
ngrams (RIR::Document)
- parse (RIR::TreeTagger::TaggerChunker)
+ parse (TreeTagger::TaggerChunker)
- remove_special_characters (String)
+ remove_special_characters (String)
search_homepage (RIR::WikipediaPage)
search_wikipedia_titles (RIR::WikipediaPage)
- strip_javascripts (String)
+ strip_javascripts (String)
- strip_javascripts! (String)
+ strip_javascripts! (String)
- strip_punctuation (String)
+ strip_punctuation (String)
- strip_punctuation! (String)
+ strip_punctuation! (String)
- strip_stylesheets (String)
+ strip_stylesheets (String)
- strip_stylesheets! (String)
+ strip_stylesheets! (String)
- strip_xml_tags (String)
+ strip_xml_tags (String)
- strip_xml_tags! (String)
+ strip_xml_tags! (String)
tf (RIR::Document)
- to_s (RIR::Indri::IndriQuery)
+ to_s (Indri::Parameters)
- to_s (RIR::Indri::Parameters)
+ to_s (Indri::IndriQuery)
diff --git a/doc/index.html b/doc/index.html index 4b44566..cb3feae 100644 --- a/doc/index.html +++ b/doc/index.html @@ -16,6 +16,6 @@ - + diff --git a/lib/rir/corpus.rb b/lib/rir/corpus.rb index 8428932..21555c0 100644 --- a/lib/rir/corpus.rb +++ b/lib/rir/corpus.rb @@ -1,5 +1,6 @@ #!/usr/bin/env ruby +#-- # This file is a part of an Information Retrieval oriented Ruby library # # Copyright (C) 2010-2011 Romain Deveaud @@ -16,25 +17,22 @@ # # You should have received a copy of the GNU General Public License # along with this program. If not, see . +#++ -module RIR +class Corpus + attr_accessor :path - class Corpus - attr_accessor :path - - def initialize(path) - @path = path.chomp "/" - end - - # Recursively outputs all files in +self.path+. - # WARNING ! This function may take a lot of time if many - # files are in subdirectories. - # - # c = Corpus.new "my/path" - # c.files # => ["README.txt", "lib/code.rb"] - def files - Dir["#{@path}/**/*.*"] - end + def initialize(path) + @path = path.chomp "/" end + # Recursively outputs all files in +self.path+. + # WARNING ! This function may take a lot of time if many + # files are in subdirectories. + # + # c = Corpus.new "my/path" + # c.files # => ["README.txt", "lib/code.rb"] + def files + Dir["#{@path}/**/*.*"] + end end diff --git a/lib/rir/document.rb b/lib/rir/document.rb index e5f69aa..2ed0a59 100644 --- a/lib/rir/document.rb +++ b/lib/rir/document.rb @@ -1,5 +1,6 @@ #!/usr/bin/env ruby +#-- # This file is a part of an Information Retrieval oriented Ruby library # # Copyright (C) 2010-2011 Romain Deveaud @@ -16,6 +17,8 @@ # # You should have received a copy of the GNU General Public License # along with this program. If not, see . +#++ + # General module for many purposes related to Information Retrieval. module RIR diff --git a/lib/rir/query.rb b/lib/rir/query.rb index dbff657..310d808 100644 --- a/lib/rir/query.rb +++ b/lib/rir/query.rb @@ -1,5 +1,6 @@ #!/usr/bin/env ruby +#-- # This file is a part of an Information Retrieval oriented Ruby library # # Copyright (C) 2010-2011 Romain Deveaud @@ -16,71 +17,69 @@ # # You should have received a copy of the GNU General Public License # along with this program. If not, see . +#++ -module RIR +class Query +end - class Query - end +module Indri - module Indri + class Parameters + attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline - class Parameters - attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline + def initialize(corpus,mem="1g",count="1000",offset="1",run_id="default",print_query=false,print_docs=false) + @index_path = corpus + @memory = mem + @count = count + @offset = offset + @run_id = run_id + @print_query = print_query ? "true" : "false" + @print_docs = print_docs ? "true" : "false" + end - def initialize(corpus,mem="1g",count="1000",offset="1",run_id="default",print_query=false,print_docs=false) - @index_path = corpus - @memory = mem - @count = count - @offset = offset - @run_id = run_id - @print_query = print_query ? "true" : "false" - @print_docs = print_docs ? "true" : "false" + def to_s + h = "\n" + h += "#{@memory}\n" + h += "#{@index_path}\n" + h += "#{@count}\n" + unless @baseline.nil? + h += "#{@baseline}\n" + else + h += "#{@rule}\n" end + h += "#{@offset}\n" + h += "#{@run_id}\n" + h += "#{@print_query}\n" + h += "#{@print_docs}\n" - def to_s - h = "\n" - h += "#{@memory}\n" - h += "#{@index_path}\n" - h += "#{@count}\n" - unless @baseline.nil? - h += "#{@baseline}\n" - else - h += "#{@rule}\n" - end - h += "#{@offset}\n" - h += "#{@run_id}\n" - h += "#{@print_query}\n" - h += "#{@print_docs}\n" - - h - end + h end - - class IndriQuery < Query - attr_accessor :id, :query, :params, :rule + end + + class IndriQuery < Query + attr_accessor :id, :query, :params, :rule - def initialize(id,query,params) - @params = params - # Here we set the default retrieval model as Language Modeling - # with a Dirichlet smoothing at 2500. - # TODO: maybe a Rule class... - @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil? + def initialize(id,query,params) + @params = params + # Here we set the default retrieval model as Language Modeling + # with a Dirichlet smoothing at 2500. + # TODO: maybe a Rule class... + @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil? - @id = id - @query = query - end + @id = id + @query = query + end - def to_s - h = @params.to_s - h += "\n" - h += "#{@id}\n" - h += "#{@query}\n" - h += "\n" - h += "" + def to_s + h = @params.to_s + h += "\n" + h += "#{@id}\n" + h += "#{@query}\n" + h += "\n" + h += "" - h - end + h end - end + end diff --git a/lib/rir/regexp.rb b/lib/rir/regexp.rb index dc718b9..e2284b7 100644 --- a/lib/rir/regexp.rb +++ b/lib/rir/regexp.rb @@ -1,5 +1,6 @@ #!/usr/bin/env ruby +#-- # This file is a part of an Information Retrieval oriented Ruby library # # Copyright (C) 2010-2011 Romain Deveaud @@ -16,6 +17,7 @@ # # You should have received a copy of the GNU General Public License # along with this program. If not, see . +#++ class Regexp diff --git a/lib/rir/string.rb b/lib/rir/string.rb index 6a9b843..efc06a3 100644 --- a/lib/rir/string.rb +++ b/lib/rir/string.rb @@ -1,5 +1,6 @@ #!/usr/bin/env ruby +#-- # This file is a part of an Information Retrieval oriented Ruby library # # Copyright (C) 2010-2011 Romain Deveaud @@ -16,6 +17,7 @@ # # You should have received a copy of the GNU General Public License # along with this program. If not, see . +#++ module RIR diff --git a/lib/rir/ttagger.rb b/lib/rir/ttagger.rb index e1f2bd6..0c1d01e 100644 --- a/lib/rir/ttagger.rb +++ b/lib/rir/ttagger.rb @@ -1,5 +1,6 @@ #!/usr/bin/env ruby +#-- # This file is a part of an Information Retrieval oriented Ruby library # # Copyright (C) 2010-2011 Romain Deveaud @@ -16,78 +17,79 @@ # # You should have received a copy of the GNU General Public License # along with this program. If not, see . +#++ -module RIR - - # TreeTagger-related stuff module. - # - # See http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/DecisionTreeTagger.html - module TreeTagger - - # This class handles generic parsing of tagger-chunker outputs. - class TaggerChunker - attr_reader :chunks, :file - - - # Parses a tagger-chunker output and returns an Array of Chunk. - def self.parse chunk_lines - open = false - tag = nil - - chunks = [] - words = [] - - chunk_lines.each do |l| - l.chomp! - if l =~ /^<\w+>$/ - open = true - tag = l - elsif l =~ /^<\/\w+>$/ - if !words.empty? && open && l == tag.sub(/$/ + open = true + tag = l + elsif l =~ /^<\/\w+>$/ + if !words.empty? && open && l == tag.sub(/ #, ...] ...> - def initialize chunk_file - @chunks = TaggerChunker.parse File.open(chunk_file).readlines end + chunks end - class TaggerChunkerEnglish < TaggerChunker + # Initializes parsing. +chunk_file+ is the output of +tagger-chunker-+ and must + # be a valid path to the file. + # + # TaggerChunker.new("ttout/2010020") #=> #, ...] ...> + def initialize chunk_file + @chunks = TaggerChunker.parse File.open(chunk_file).readlines end - class TaggerChunkerFrench < TaggerChunker - end + end - class TaggerChunkerGerman < TaggerChunker - end + class TaggerChunkerEnglish < TaggerChunker + end - # Represents a Chunk extracted when parsing a TaggerChunker file. - class Chunk - attr_reader :words, :tag + class TaggerChunkerFrench < TaggerChunker + end - # +str+ are whitespace-separated terms. - # +tag+ see : ftp://ftp.ims.uni-stuttgart.de/pub/corpora/chunker-tagset-english.txt - def initialize str,tag - @words = str.split - @tag = tag[1..-2] - end - end + class TaggerChunkerGerman < TaggerChunker + end + # Represents a Chunk extracted when parsing a TaggerChunker file. + class Chunk + attr_reader :words, :tag + + # Creates a Chunk. + # + # * +str+ are whitespace-separated terms. + # * +tag+ see : ftp://ftp.ims.uni-stuttgart.de/pub/corpora/chunker-tagset-english.txt + def initialize str,tag + @words = str.split + @tag = tag[1..-2] + end end + end -- 1.8.2.3