diff --git a/README.markdown b/README.markdown index e61112c..9c4bdba 100644 --- a/README.markdown +++ b/README.markdown @@ -1,4 +1,4 @@ -# Ruby Information Retrieval (rIR) +# mirimiri Copyright (C) 2010-2011 Romain Deveaud diff --git a/Rakefile b/Rakefile index c9fbb55..55ededd 100644 --- a/Rakefile +++ b/Rakefile @@ -2,7 +2,7 @@ require 'rake' require 'rake/testtask' Rake::TestTask.new(:test) do |test| - test.libs << 'lib' << 'lib/rir' << 'test' + test.libs << 'lib' << 'lib/mirimiri' << 'test' test.pattern = 'test/**/*_test.rb' test.verbose = true end diff --git a/doc/classes/Corpus.html b/doc/classes/Corpus.html index cacc4da..3026cbd 100644 --- a/doc/classes/Corpus.html +++ b/doc/classes/Corpus.html @@ -53,9 +53,9 @@ - + - lib/rir/corpus.rb + lib/mirimiri/corpus.rb diff --git a/doc/classes/Corpus.src/M000001.html b/doc/classes/Corpus.src/M000001.html index dc4ae58..a1d348c 100644 --- a/doc/classes/Corpus.src/M000001.html +++ b/doc/classes/Corpus.src/M000001.html @@ -7,7 +7,7 @@ -
# File lib/rir/corpus.rb, line 25
+  
# File lib/mirimiri/corpus.rb, line 25
   def initialize(path)
     @path = path.chomp "/"
   end
diff --git a/doc/classes/Corpus.src/M000002.html b/doc/classes/Corpus.src/M000002.html index 0a9287f..058ea50 100644 --- a/doc/classes/Corpus.src/M000002.html +++ b/doc/classes/Corpus.src/M000002.html @@ -7,7 +7,7 @@ -
# File lib/rir/corpus.rb, line 35
+  
# File lib/mirimiri/corpus.rb, line 35
   def files
     Dir["#{@path}/**/*.*"]
   end
diff --git a/doc/classes/Indri.html b/doc/classes/Indri.html index c875dd7..b84a4e3 100644 --- a/doc/classes/Indri.html +++ b/doc/classes/Indri.html @@ -53,9 +53,9 @@ - + - lib/rir/query.rb + lib/mirimiri/query.rb diff --git a/doc/classes/Indri/IndriQuery.html b/doc/classes/Indri/IndriQuery.html index 698b8a1..d9d0e48 100644 --- a/doc/classes/Indri/IndriQuery.html +++ b/doc/classes/Indri/IndriQuery.html @@ -53,9 +53,9 @@ - + - lib/rir/query.rb + lib/mirimiri/query.rb diff --git a/doc/classes/Indri/IndriQuery.src/M000020.html b/doc/classes/Indri/IndriQuery.src/M000020.html index a66ba37..5b978a4 100644 --- a/doc/classes/Indri/IndriQuery.src/M000020.html +++ b/doc/classes/Indri/IndriQuery.src/M000020.html @@ -7,7 +7,7 @@ -
# File lib/rir/query.rb, line 62
+  
# File lib/mirimiri/query.rb, line 62
     def initialize(id,query,params)
       @params = params
       # Here we set the default retrieval model as Language Modeling
diff --git a/doc/classes/Indri/IndriQuery.src/M000021.html b/doc/classes/Indri/IndriQuery.src/M000021.html
index 728320c..9965d9f 100644
--- a/doc/classes/Indri/IndriQuery.src/M000021.html
+++ b/doc/classes/Indri/IndriQuery.src/M000021.html
@@ -7,7 +7,7 @@
   
 
 
-  
# File lib/rir/query.rb, line 73
+  
# File lib/mirimiri/query.rb, line 73
     def to_s
       h = @params.to_s
       h += "<query>\n"
diff --git a/doc/classes/Indri/Parameters.html b/doc/classes/Indri/Parameters.html
index e70ca27..96c2d3c 100644
--- a/doc/classes/Indri/Parameters.html
+++ b/doc/classes/Indri/Parameters.html
@@ -53,9 +53,9 @@
             
 
 
-                
+                
 
-                lib/rir/query.rb
+                lib/mirimiri/query.rb
 
                 
 
diff --git a/doc/classes/Indri/Parameters.src/M000018.html b/doc/classes/Indri/Parameters.src/M000018.html
index 87337f4..46b5961 100644
--- a/doc/classes/Indri/Parameters.src/M000018.html
+++ b/doc/classes/Indri/Parameters.src/M000018.html
@@ -7,7 +7,7 @@
   
 
 
-  
# File lib/rir/query.rb, line 30
+  
# File lib/mirimiri/query.rb, line 30
     def initialize(corpus,mem="1g",count="1000",offset="1",run_id="default",print_query=false,print_docs=false)
       @index_path  = corpus
       @memory      = mem
diff --git a/doc/classes/Indri/Parameters.src/M000019.html b/doc/classes/Indri/Parameters.src/M000019.html
index da5c34c..dfb970a 100644
--- a/doc/classes/Indri/Parameters.src/M000019.html
+++ b/doc/classes/Indri/Parameters.src/M000019.html
@@ -7,7 +7,7 @@
   
 
 
-  
# File lib/rir/query.rb, line 40
+  
# File lib/mirimiri/query.rb, line 40
     def to_s
       h = "<parameters>\n"
       h += "<memory>#{@memory}</memory>\n"
diff --git a/doc/classes/Query.html b/doc/classes/Query.html
index c29e471..f21f292 100644
--- a/doc/classes/Query.html
+++ b/doc/classes/Query.html
@@ -53,9 +53,9 @@
             
 
 
-                
+                
 
-                lib/rir/query.rb
+                lib/mirimiri/query.rb
 
                 
 
diff --git a/doc/classes/RIR.html b/doc/classes/RIR.html
deleted file mode 100644
index 84230a7..0000000
--- a/doc/classes/RIR.html
+++ /dev/null
@@ -1,150 +0,0 @@
-
-
-
-  Module: RIR [RDoc Documentation]
-  
-  
-  
-  
-
-
-
-
-
-    
- - - - - - - - - - - -
ModuleRIR
In: - - - - - lib/rir/string.rb - - - - -
- - - - - lib/rir/document.rb - - - - -
- -
-
- - -
- -
- -
-

-General module for many purposes related to Information Retrieval. -

- -
- -
- - -
- - - -
- -
-

Classes and Modules

- - Class RIR::Document
-Class RIR::WebDocument
-Class RIR::WikipediaPage
- -
- -
-

Constants

- -
- - - - - - - - - - - - -
Stoplist=[ "a", "anything", "anyway", "anywhere", "apart", "are", "around", "as", "at", "av", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "both", "but", "by", "can", "cannot", "canst", "certain", "cf", "choose", "contrariwise", "cos", "could", "cu", "day", "do", "does", "doesn't", "doing", "dost", "doth", "double", "down", "dual", "during", "each", "either", "else", "elsewhere", "enough", "et", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "except", "excepted", "excepting", "exception", "exclude", "excluding", "exclusive", "far", "farther", "farthest", "few", "ff", "first", "for", "formerly", "forth", "forward", "from", "front", "further", "furthermore", "furthest", "get", "go", "had", "halves", "hardly", "has", "hast", "hath", "have", "he", "hence", "henceforth", "her", "here", "hereabouts", "hereafter", "hereby", "herein", "hereto", "hereupon", "hers", "herself", "him", "himself", "hindmost", "his", "hither", "hitherto", "how", "however", "howsoever", "i", "ie", "if", "in", "inasmuch", "inc", "include", "included", "including", "indeed", "indoors", "inside", "insomuch", "instead", "into", "inward", "inwards", "is", "it", "its", "itself", "just", "kind", "kg", "km", "last", "latter", "latterly", "less", "lest", "let", "like", "little", "ltd", "many", "may", "maybe", "me", "meantime", "meanwhile", "might", "moreover", "most", "mostly", "more", "mr", "mrs", "ms", "much", "must", "my", "myself", "namely", "need", "neither", "never", "nevertheless", "next", "no", "nobody", "none", "nonetheless", "noone", "nope", "nor", "not", "nothing", "notwithstanding", "now", "nowadays", "nowhere", "of", "off", "often", "ok", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "own", "per", "perhaps", "plenty", "provide", "quite", "rather", "really", "round", "said", "sake", "same", "sang", "save", "saw", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "seldom", "selves", "sent", "several", "shalt", "she", "should", "shown", "sideways", "since", "slept", "slew", "slung", "slunk", "smote", "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", "somewhat", "somewhere", "spake", "spat", "spoke", "spoken", "sprang", "sprung", "stave", "staves", "still", "such", "supposing", "than", "that", "the", "thee", "their", "them", "themselves", "then", "thence", "thenceforth", "there", "thereabout", "thereabouts", "thereafter", "thereby", "therefore", "therein", "thereof", "thereon", "thereto", "thereupon", "these", "they", "this", "those", "thou", "though", "thrice", "through", "throughout", "thru", "thus", "thy", "thyself", "till", "to", "together", "too", "toward", "towards", "ugh", "unable", "under", "underneath", "unless", "unlike", "until", "up", "upon", "upward", "upwards", "us", "use", "used", "using", "very", "via", "vs", "want", "was", "we", "week", "well", "were", "what", "whatever", "whatsoever", "when", "whence", "whenever", "whensoever", "where", "whereabouts", "whereafter", "whereas", "whereat", "whereby", "wherefore", "wherefrom", "wherein", "whereinto", "whereof", "whereon", "wheresoever", "whereto", "whereunto", "whereupon", "wherever", "wherewith", "whether", "whew", "which", "whichever", "whichsoever", "while", "whilst", "whither", "who", "whoa", "whoever", "whole", "whom", "whomever", "whomsoever", "whose", "whosoever", "why", "will", "wilt", "with", "within", "without", "worse", "worst", "would", "wow", "ye", "yet", "year", "yippee", "you", "your", "yours", "yourself", "yourselves" ]  -These are the default stopwords provided by Lemur. - -
-
-
- - - - - - - - - -
- - - - - diff --git a/doc/classes/RIR/Corpus.html b/doc/classes/RIR/Corpus.html deleted file mode 100644 index 2fdf78b..0000000 --- a/doc/classes/RIR/Corpus.html +++ /dev/null @@ -1,200 +0,0 @@ - - - - Class: RIR::Corpus [RDoc Documentation] - - - - - - - - - -
- - - - - - - - - - - - - - - - -
ClassRIR::Corpus
In: - - - - - lib/rir/corpus.rb - - - - -
- -
Parent: - - Object - -
-
- - -
- -
- -
- - -
-

Methods

- -
- - files   - - new   - -
-
- -
- - - -
- - - -
-

Attributes

- -
- - - - - - - - - - -
path [RW] 
-
-
- - - - -
- -

Public Class methods

- - -
- - - - -
- -
-
- - -

Public Instance methods

- - -
- - - - -
- -

-Recursively outputs all files in self.path. WARNING ! This -function may take a lot of time if many files are in subdirectories. -

-
-  c = Corpus.new "my/path"
-  c.files                  # => ["README.txt", "lib/code.rb"]
-
- -
-
- - - -
- - - - -
- - - - - diff --git a/doc/classes/RIR/Corpus.src/M000020.html b/doc/classes/RIR/Corpus.src/M000020.html deleted file mode 100644 index cb605ef..0000000 --- a/doc/classes/RIR/Corpus.src/M000020.html +++ /dev/null @@ -1,15 +0,0 @@ - - - - new (RIR::Corpus) - - - - -
# File lib/rir/corpus.rb, line 25
-    def initialize(path)
-      @path = path.chomp "/"
-    end
- - diff --git a/doc/classes/RIR/Corpus.src/M000021.html b/doc/classes/RIR/Corpus.src/M000021.html deleted file mode 100644 index a4eb5fa..0000000 --- a/doc/classes/RIR/Corpus.src/M000021.html +++ /dev/null @@ -1,15 +0,0 @@ - - - - files (RIR::Corpus) - - - - -
# File lib/rir/corpus.rb, line 35
-    def files
-      Dir["#{@path}/**/*.*"]
-    end
- - diff --git a/doc/classes/RIR/Document.html b/doc/classes/RIR/Document.html deleted file mode 100644 index fdefaa1..0000000 --- a/doc/classes/RIR/Document.html +++ /dev/null @@ -1,342 +0,0 @@ - - - - Class: RIR::Document [RDoc Documentation] - - - - - - - - - -
- - - - - - - - - - - - - - - - -
ClassRIR::Document
In: - - - - - lib/rir/document.rb - - - - -
- -
Parent: - - Object - -
-
- - -
- -
- -
-

-A Document is a bag of words and is constructed -from a string. -

- -
- -
- - -
-

Methods

- -
- - count_words   - - entropy   - - format_words   - - new   - - ngrams   - - tf   - -
-
- -
- - - -
- - - -
-

Attributes

- -
- - - - - - - - - - - - - - - - - - -
doc_content [R] 
words [R] 
-
-
- - - - -
- -

Public Class methods

- - -
- - - - -
- -
-
- - -

Public Instance methods

- - -
- - - - -
- -

-Returns a Hash containing the words and their associated counts in the -current Document. -

-
-  count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }
-
- -
-
- - -
- - - - -
- -

-Computes the entropy of a given string s inside the document. -

-

-If the string parameter is composed of many words (i.e. tokens separated by -whitespace(s)), it is considered as an ngram. -

-
-  entropy("guitar") #=> 0.00432114812727959
-  entropy("dillinger escape plan") #=> 0.265862076325102
-
- -
-
- - -
- - - - -
- -

-Returns an Array containing the n-grams (words) from the current -Document. -

-
-  ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]
-
- -
-
- - -
- - - - -
- -

-Computes the term frequency of a given word s. -

-
-  tf("guitar") #=> 0.000380372765310004
-
- -
-
- - -

Protected Instance methods

- - -
- - - - -
- -

-Any non-word characters are removed from the words (see perldoc.perl.org/perlre.html -and the W special escape). -

-

-Protected function, only meant to by called at the initialization. -

- -
-
- - - -
- - - - -
- - - - - diff --git a/doc/classes/RIR/Document.src/M000022.html b/doc/classes/RIR/Document.src/M000022.html deleted file mode 100644 index d476fd8..0000000 --- a/doc/classes/RIR/Document.src/M000022.html +++ /dev/null @@ -1,23 +0,0 @@ - - - - format_words (RIR::Document) - - - - -
# File lib/rir/document.rb, line 34
-    def format_words
-      wo = []
-
-      @doc_content.split.each do |w|
-        w.split(/\W/).each do |sw| 
-          wo.push(sw.downcase) if sw =~ /[a-zA-Z]/ 
-        end
-      end
-      
-      wo
-    end
- - diff --git a/doc/classes/RIR/Document.src/M000023.html b/doc/classes/RIR/Document.src/M000023.html deleted file mode 100644 index 3416fef..0000000 --- a/doc/classes/RIR/Document.src/M000023.html +++ /dev/null @@ -1,26 +0,0 @@ - - - - ngrams (RIR::Document) - - - - -
# File lib/rir/document.rb, line 49
-    def ngrams(n)
-      window       = []
-      ngrams_array = []
-
-      @words.each do |w|
-        window.push(w)
-        if window.size == n
-          ngrams_array.push window.join(" ")
-          window.delete_at(0)
-        end
-      end
-
-      ngrams_array.uniq
-    end
- - diff --git a/doc/classes/RIR/Document.src/M000024.html b/doc/classes/RIR/Document.src/M000024.html deleted file mode 100644 index bb859fd..0000000 --- a/doc/classes/RIR/Document.src/M000024.html +++ /dev/null @@ -1,18 +0,0 @@ - - - - count_words (RIR::Document) - - - - -
# File lib/rir/document.rb, line 67
-    def count_words
-      counts = Hash.new { |h,k| h[k] = 0 }
-      @words.each { |w| counts[w] += 1 }
-
-      counts
-    end
- - diff --git a/doc/classes/RIR/Document.src/M000025.html b/doc/classes/RIR/Document.src/M000025.html deleted file mode 100644 index 9ccf905..0000000 --- a/doc/classes/RIR/Document.src/M000025.html +++ /dev/null @@ -1,24 +0,0 @@ - - - - entropy (RIR::Document) - - - - -
# File lib/rir/document.rb, line 81
-    def entropy(s)
-      en = 0.0
-      counts = self.count_words
-
-      s.split.each do |w|
-        p_wi = counts[w].to_f/@words.count.to_f
-        en += p_wi*Math.log2(p_wi)
-      end
-
-      en *= -1
-      en
-    end
- - diff --git a/doc/classes/RIR/Document.src/M000026.html b/doc/classes/RIR/Document.src/M000026.html deleted file mode 100644 index 0b57bd6..0000000 --- a/doc/classes/RIR/Document.src/M000026.html +++ /dev/null @@ -1,15 +0,0 @@ - - - - tf (RIR::Document) - - - - -
# File lib/rir/document.rb, line 97
-    def tf(s)
-      self.count_words[s].to_f/@words.size.to_f
-    end
- - diff --git a/doc/classes/RIR/Document.src/M000027.html b/doc/classes/RIR/Document.src/M000027.html deleted file mode 100644 index 6b8e2c2..0000000 --- a/doc/classes/RIR/Document.src/M000027.html +++ /dev/null @@ -1,16 +0,0 @@ - - - - new (RIR::Document) - - - - -
# File lib/rir/document.rb, line 102
-    def initialize(content)
-      @doc_content = content
-      @words = format_words
-    end
- - diff --git a/doc/classes/RIR/Indri.html b/doc/classes/RIR/Indri.html deleted file mode 100644 index 34a1058..0000000 --- a/doc/classes/RIR/Indri.html +++ /dev/null @@ -1,109 +0,0 @@ - - - - Module: RIR::Indri [RDoc Documentation] - - - - - - - - - -
- - - - - - - - - - - -
ModuleRIR::Indri
In: - - - - - lib/rir/query.rb - - - - -
- -
-
- - -
- -
- -
- - -
- - - -
- -
-

Classes and Modules

- - Class RIR::Indri::IndriQuery
-Class RIR::Indri::Parameters
- -
- - - - - - - - - -
- - - - - diff --git a/doc/classes/RIR/Indri/IndriQuery.html b/doc/classes/RIR/Indri/IndriQuery.html deleted file mode 100644 index 02ba68c..0000000 --- a/doc/classes/RIR/Indri/IndriQuery.html +++ /dev/null @@ -1,219 +0,0 @@ - - - - Class: RIR::Indri::IndriQuery [RDoc Documentation] - - - - - - - - - -
- - - - - - - - - - - - - - - - -
ClassRIR::Indri::IndriQuery
In: - - - - - lib/rir/query.rb - - - - -
- -
Parent: - - - - RIR::Query - - - -
-
- - -
- -
- -
- - -
-

Methods

- -
- - new   - - to_s   - -
-
- -
- - - -
- - - -
-

Attributes

- -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
id [RW] 
params [RW] 
query [RW] 
rule [RW] 
-
-
- - - - -
- -

Public Class methods

- - -
- - - - -
- -
-
- - -

Public Instance methods

- - -
- - - - -
- -
-
- - - -
- - - - -
- - - - - diff --git a/doc/classes/RIR/Indri/IndriQuery.src/M000018.html b/doc/classes/RIR/Indri/IndriQuery.src/M000018.html deleted file mode 100644 index c72d135..0000000 --- a/doc/classes/RIR/Indri/IndriQuery.src/M000018.html +++ /dev/null @@ -1,22 +0,0 @@ - - - - new (RIR::Indri::IndriQuery) - - - - -
# File lib/rir/query.rb, line 62
-      def initialize(id,query,params)
-        @params = params
-        # Here we set the default retrieval model as Language Modeling
-        # with a Dirichlet smoothing at 2500.
-        # TODO: maybe a Rule class...
-        @params.rule  = 'method:dirichlet,mu:2500' if @params.rule.nil?
-
-        @id     = id
-        @query  = query
-      end
- - diff --git a/doc/classes/RIR/Indri/IndriQuery.src/M000019.html b/doc/classes/RIR/Indri/IndriQuery.src/M000019.html deleted file mode 100644 index e237f9a..0000000 --- a/doc/classes/RIR/Indri/IndriQuery.src/M000019.html +++ /dev/null @@ -1,22 +0,0 @@ - - - - to_s (RIR::Indri::IndriQuery) - - - - -
# File lib/rir/query.rb, line 73
-      def to_s
-        h = @params.to_s
-        h += "<query>\n"
-        h += "<number>#{@id}</number>\n"
-        h += "<text>#{@query}</text>\n"
-        h += "</query>\n"
-        h += "</parameters>"
-
-        h
-      end
- - diff --git a/doc/classes/RIR/Indri/Parameters.html b/doc/classes/RIR/Indri/Parameters.html deleted file mode 100644 index 79d6734..0000000 --- a/doc/classes/RIR/Indri/Parameters.html +++ /dev/null @@ -1,255 +0,0 @@ - - - - Class: RIR::Indri::Parameters [RDoc Documentation] - - - - - - - - - -
- - - - - - - - - - - - - - - - -
ClassRIR::Indri::Parameters
In: - - - - - lib/rir/query.rb - - - - -
- -
Parent: - - Object - -
-
- - -
- -
- -
- - -
-

Methods

- -
- - new   - - to_s   - -
-
- -
- - - -
- - - -
-

Attributes

- -
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
baseline [RW] 
count [RW] 
index_path [RW] 
memory [RW] 
offset [RW] 
print_docs [RW] 
print_query [RW] 
rule [RW] 
run_id [RW] 
-
-
- - - - -
- -

Public Class methods

- - - - - -

Public Instance methods

- - -
- - - - -
- -
-
- - - -
- - - - -
- - - - - diff --git a/doc/classes/RIR/Indri/Parameters.src/M000016.html b/doc/classes/RIR/Indri/Parameters.src/M000016.html deleted file mode 100644 index a5d26e0..0000000 --- a/doc/classes/RIR/Indri/Parameters.src/M000016.html +++ /dev/null @@ -1,21 +0,0 @@ - - - - new (RIR::Indri::Parameters) - - - - -
# File lib/rir/query.rb, line 30
-      def initialize(corpus,mem="1g",count="1000",offset="1",run_id="default",print_query=false,print_docs=false)
-        @index_path  = corpus
-        @memory      = mem
-        @count       = count
-        @offset      = offset
-        @run_id      = run_id
-        @print_query = print_query ? "true" : "false"
-        @print_docs  = print_docs  ? "true" : "false"
-      end
- - diff --git a/doc/classes/RIR/Indri/Parameters.src/M000017.html b/doc/classes/RIR/Indri/Parameters.src/M000017.html deleted file mode 100644 index 3a235ca..0000000 --- a/doc/classes/RIR/Indri/Parameters.src/M000017.html +++ /dev/null @@ -1,29 +0,0 @@ - - - - to_s (RIR::Indri::Parameters) - - - - -
# File lib/rir/query.rb, line 40
-      def to_s
-        h = "<parameters>\n"
-        h += "<memory>#{@memory}</memory>\n"
-        h += "<index>#{@index_path}</index>\n"
-        h += "<count>#{@count}</count>\n"
-        unless @baseline.nil?
-          h += "<baseline>#{@baseline}</baseline>\n" 
-        else
-          h += "<rule>#{@rule}</rule>\n"
-        end
-        h += "<queryOffset>#{@offset}</queryOffset>\n"
-        h += "<runID>#{@run_id}</runID>\n"
-        h += "<printQuery>#{@print_query}</printQuery>\n"
-        h += "<printDocuments>#{@print_docs}</printDocuments>\n"
-
-        h
-      end
- - diff --git a/doc/classes/RIR/Query.html b/doc/classes/RIR/Query.html deleted file mode 100644 index ba41f8d..0000000 --- a/doc/classes/RIR/Query.html +++ /dev/null @@ -1,110 +0,0 @@ - - - - Class: RIR::Query [RDoc Documentation] - - - - - - - - - -
- - - - - - - - - - - - - - - - -
ClassRIR::Query
In: - - - - - lib/rir/query.rb - - - - -
- -
Parent: - - Object - -
-
- - -
- -
- -
- - -
- - - -
- - - - - - - - - -
- - - - - diff --git a/doc/classes/RIR/TreeTagger.html b/doc/classes/RIR/TreeTagger.html deleted file mode 100644 index a46f3bd..0000000 --- a/doc/classes/RIR/TreeTagger.html +++ /dev/null @@ -1,123 +0,0 @@ - - - - Module: RIR::TreeTagger [RDoc Documentation] - - - - - - - - - -
- - - - - - - - - - - -
ModuleRIR::TreeTagger
In: - - - - - lib/rir/ttagger.rb - - - - -
- -
-
- - -
- -
- -
-

-TreeTagger-related stuff module. -

-

-See www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/DecisionTreeTagger.html -

- -
- -
- - -
- - - - - - - - - diff --git a/doc/classes/RIR/TreeTagger/Chunk.html b/doc/classes/RIR/TreeTagger/Chunk.html deleted file mode 100644 index 20dca98..0000000 --- a/doc/classes/RIR/TreeTagger/Chunk.html +++ /dev/null @@ -1,187 +0,0 @@ - - - - Class: RIR::TreeTagger::Chunk [RDoc Documentation] - - - - - - - - - -
- - - - - - - - - - - - - - - - -
ClassRIR::TreeTagger::Chunk
In: - - - - - lib/rir/ttagger.rb - - - - -
- -
Parent: - - Object - -
-
- - -
- -
- -
-

-Represents a Chunk extracted when parsing a TaggerChunker file. -

- -
- -
- - -
-

Methods

- -
- - new   - -
-
- -
- - - -
- - - -
-

Attributes

- -
- - - - - - - - - - - - - - - - - - -
tag [R] 
words [R] 
-
-
- - - - -
- -

Public Class methods

- - -
- - - - -
- -

-str are whitespace-separated terms. tag see : ftp.ims.uni-stuttgart.de/pub/corpora/chunker-tagset-english.txt -

- -
-
- - - -
- - - - -
- - - - - diff --git a/doc/classes/RIR/TreeTagger/Chunk.src/M000015.html b/doc/classes/RIR/TreeTagger/Chunk.src/M000015.html deleted file mode 100644 index 239dc5a..0000000 --- a/doc/classes/RIR/TreeTagger/Chunk.src/M000015.html +++ /dev/null @@ -1,16 +0,0 @@ - - - - new (RIR::TreeTagger::Chunk) - - - - -
# File lib/rir/ttagger.rb, line 86
-      def initialize str,tag
-        @words = str.split
-        @tag   = tag[1..-2]
-      end
- - diff --git a/doc/classes/RIR/TreeTagger/TaggerChunker.html b/doc/classes/RIR/TreeTagger/TaggerChunker.html deleted file mode 100644 index dc13c75..0000000 --- a/doc/classes/RIR/TreeTagger/TaggerChunker.html +++ /dev/null @@ -1,216 +0,0 @@ - - - - Class: RIR::TreeTagger::TaggerChunker [RDoc Documentation] - - - - - - - - - -
- - - - - - - - - - - - - - - - -
ClassRIR::TreeTagger::TaggerChunker
In: - - - - - lib/rir/ttagger.rb - - - - -
- -
Parent: - - Object - -
-
- - -
- -
- -
-

-This class handles generic parsing of tagger-chunker outputs. -

- -
- -
- - -
-

Methods

- -
- - new   - - parse   - -
-
- -
- - - -
- - - -
-

Attributes

- -
- - - - - - - - - - - - - - - - - - -
chunks [R] 
file [R] 
-
-
- - - - -
- -

Public Class methods

- - -
- - - - -
- -

-Initializes parsing. chunk_file is the output of -tagger-chunker- and must be a valid path to the file. -

-
-  TaggerChunker.new("ttout/2010020") #=> #<RIR::TreeTagger::TaggerChunker:0x92fd088 @chunks=[#<RIR::TreeTagger::Chunk:0x8ec5a10 @words=["robert", "schumann"], @tag="NC">, ...] ...>
-
- -
-
- - -
- - - - -
- -

-Parses a tagger-chunker output and returns an Array of Chunk. -

- -
-
- - - -
- - - - -
- - - - - diff --git a/doc/classes/RIR/TreeTagger/TaggerChunker.src/M000013.html b/doc/classes/RIR/TreeTagger/TaggerChunker.src/M000013.html deleted file mode 100644 index 3bdb228..0000000 --- a/doc/classes/RIR/TreeTagger/TaggerChunker.src/M000013.html +++ /dev/null @@ -1,39 +0,0 @@ - - - - parse (RIR::TreeTagger::TaggerChunker) - - - - -
# File lib/rir/ttagger.rb, line 33
-      def self.parse chunk_lines
-        open = false
-        tag  = nil
-
-        chunks = []
-        words  = []
-
-        chunk_lines.each do |l|
-          l.chomp!
-          if l =~ /^<\w+>$/
-            open = true
-            tag  = l
-          elsif l =~ /^<\/\w+>$/
-            if !words.empty? && open && l == tag.sub(/</, '</')
-              open = false
-              chunks.push Chunk.new(words.join(" "), tag) 
-              words.clear
-            else
-              next
-            end
-          else
-            words.push(l.split.first)
-          end
-        end
-
-        chunks
-      end
- - diff --git a/doc/classes/RIR/TreeTagger/TaggerChunker.src/M000014.html b/doc/classes/RIR/TreeTagger/TaggerChunker.src/M000014.html deleted file mode 100644 index c33487c..0000000 --- a/doc/classes/RIR/TreeTagger/TaggerChunker.src/M000014.html +++ /dev/null @@ -1,15 +0,0 @@ - - - - new (RIR::TreeTagger::TaggerChunker) - - - - -
# File lib/rir/ttagger.rb, line 65
-      def initialize chunk_file
-        @chunks = TaggerChunker.parse File.open(chunk_file).readlines
-      end
- - diff --git a/doc/classes/RIR/TreeTagger/TaggerChunkerEnglish.html b/doc/classes/RIR/TreeTagger/TaggerChunkerEnglish.html deleted file mode 100644 index f5a878f..0000000 --- a/doc/classes/RIR/TreeTagger/TaggerChunkerEnglish.html +++ /dev/null @@ -1,114 +0,0 @@ - - - - Class: RIR::TreeTagger::TaggerChunkerEnglish [RDoc Documentation] - - - - - - - - - -
- - - - - - - - - - - - - - - - -
ClassRIR::TreeTagger::TaggerChunkerEnglish
In: - - - - - lib/rir/ttagger.rb - - - - -
- -
Parent: - - - - RIR::TreeTagger::TaggerChunker - - - -
-
- - -
- -
- -
- - -
- - - -
- - - - - - - - - -
- - - - - diff --git a/doc/classes/RIR/TreeTagger/TaggerChunkerFrench.html b/doc/classes/RIR/TreeTagger/TaggerChunkerFrench.html deleted file mode 100644 index 203b487..0000000 --- a/doc/classes/RIR/TreeTagger/TaggerChunkerFrench.html +++ /dev/null @@ -1,114 +0,0 @@ - - - - Class: RIR::TreeTagger::TaggerChunkerFrench [RDoc Documentation] - - - - - - - - - -
- - - - - - - - - - - - - - - - -
ClassRIR::TreeTagger::TaggerChunkerFrench
In: - - - - - lib/rir/ttagger.rb - - - - -
- -
Parent: - - - - RIR::TreeTagger::TaggerChunker - - - -
-
- - -
- -
- -
- - -
- - - -
- - - - - - - - - -
- - - - - diff --git a/doc/classes/RIR/TreeTagger/TaggerChunkerGerman.html b/doc/classes/RIR/TreeTagger/TaggerChunkerGerman.html deleted file mode 100644 index fd7e410..0000000 --- a/doc/classes/RIR/TreeTagger/TaggerChunkerGerman.html +++ /dev/null @@ -1,114 +0,0 @@ - - - - Class: RIR::TreeTagger::TaggerChunkerGerman [RDoc Documentation] - - - - - - - - - -
- - - - - - - - - - - - - - - - -
ClassRIR::TreeTagger::TaggerChunkerGerman
In: - - - - - lib/rir/ttagger.rb - - - - -
- -
Parent: - - - - RIR::TreeTagger::TaggerChunker - - - -
-
- - -
- -
- -
- - -
- - - -
- - - - - - - - - -
- - - - - diff --git a/doc/classes/RIR/WebDocument.html b/doc/classes/RIR/WebDocument.html deleted file mode 100644 index 4f034b9..0000000 --- a/doc/classes/RIR/WebDocument.html +++ /dev/null @@ -1,209 +0,0 @@ - - - - Class: RIR::WebDocument [RDoc Documentation] - - - - - - - - - -
- - - - - - - - - - - - - - - - -
ClassRIR::WebDocument
In: - - - - - lib/rir/document.rb - - - - -
- -
Parent: - - - - RIR::Document - - - -
-
- - -
- -
- -
-

-A WebDocument is a Document with a url. -

- -
- -
- - -
-

Methods

- -
- - get_content   - - new   - -
-
- -
- - - -
- - - -
-

Attributes

- -
- - - - - - - - - - -
url [R] 
-
-
- - - - -
- -

Public Class methods

- - -
- - - - -
- -

-Returns the HTML text from the page of a given url. -

- -
-
- - -
- - - - -
- -

-WebDocument constructor, the content of the -Document is the HTML page without the tags. -

- -
-
- - - -
- - - - -
- - - - - diff --git a/doc/classes/RIR/WebDocument.src/M000028.html b/doc/classes/RIR/WebDocument.src/M000028.html deleted file mode 100644 index cf2b1a5..0000000 --- a/doc/classes/RIR/WebDocument.src/M000028.html +++ /dev/null @@ -1,16 +0,0 @@ - - - - get_content (RIR::WebDocument) - - - - -
# File lib/rir/document.rb, line 115
-    def self.get_content(url)
-      require 'net/http'
-      Net::HTTP.get(URI.parse(url))
-    end
- - diff --git a/doc/classes/RIR/WebDocument.src/M000029.html b/doc/classes/RIR/WebDocument.src/M000029.html deleted file mode 100644 index 6750287..0000000 --- a/doc/classes/RIR/WebDocument.src/M000029.html +++ /dev/null @@ -1,16 +0,0 @@ - - - - new (RIR::WebDocument) - - - - -
# File lib/rir/document.rb, line 122
-    def initialize(url)
-      @url = url
-      super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags
-    end
- - diff --git a/doc/classes/RIR/WikipediaPage.html b/doc/classes/RIR/WikipediaPage.html deleted file mode 100644 index 645a791..0000000 --- a/doc/classes/RIR/WikipediaPage.html +++ /dev/null @@ -1,204 +0,0 @@ - - - - Class: RIR::WikipediaPage [RDoc Documentation] - - - - - - - - - -
- - - - - - - - - - - - - - - - -
ClassRIR::WikipediaPage
In: - - - - - lib/rir/document.rb - - - - -
- -
Parent: - - - - RIR::WebDocument - - - -
-
- - -
- -
- -
-

-A WikipediaPage is a WebDocument. -

- -
- -
- - -
-

Methods

- -
- - get_url   - - search_homepage   - - search_wikipedia_titles   - -
-
- -
- - - -
- - - - - - -
- -

Public Class methods

- - -
- - - - -
- -
-
- - -
- - - - -
- -
-
- - - - - - -
- - - - -
- - - - - diff --git a/doc/classes/RIR/WikipediaPage.src/M000030.html b/doc/classes/RIR/WikipediaPage.src/M000030.html deleted file mode 100644 index 3318c27..0000000 --- a/doc/classes/RIR/WikipediaPage.src/M000030.html +++ /dev/null @@ -1,19 +0,0 @@ - - - - search_wikipedia_titles (RIR::WikipediaPage) - - - - -
# File lib/rir/document.rb, line 135
-    def self.search_wikipedia_titles(name)
-      raise ArgumentError, "Bad encoding", name unless name.isutf8
-
-      res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).toutf8).elements['api/query/search']
-
-      res.collect { |e| e.attributes['title'] } unless res.nil?
-    end
- - diff --git a/doc/classes/RIR/WikipediaPage.src/M000031.html b/doc/classes/RIR/WikipediaPage.src/M000031.html deleted file mode 100644 index 01ebe85..0000000 --- a/doc/classes/RIR/WikipediaPage.src/M000031.html +++ /dev/null @@ -1,19 +0,0 @@ - - - - get_url (RIR::WikipediaPage) - - - - -
# File lib/rir/document.rb, line 143
-    def self.get_url(name)
-      raise ArgumentError, "Bad encoding", name unless name.isutf8
-
-      atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).toutf8).elements['api/query/pages/page'].attributes
-
-      atts['fullurl'] if atts['missing'].nil?
-    end
- - diff --git a/doc/classes/RIR/WikipediaPage.src/M000032.html b/doc/classes/RIR/WikipediaPage.src/M000032.html deleted file mode 100644 index 41f155c..0000000 --- a/doc/classes/RIR/WikipediaPage.src/M000032.html +++ /dev/null @@ -1,17 +0,0 @@ - - - - search_homepage (RIR::WikipediaPage) - - - - -
# File lib/rir/document.rb, line 151
-    def self.search_homepage(name)
-      title = WikipediaPage.search_wikipedia_titles name
-
-      WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty?
-    end
- - diff --git a/doc/classes/Regexp.html b/doc/classes/Regexp.html index 03160e6..ecb9d67 100644 --- a/doc/classes/Regexp.html +++ b/doc/classes/Regexp.html @@ -53,9 +53,9 @@ - + - lib/rir/regexp.rb + lib/mirimiri/regexp.rb diff --git a/doc/classes/Regexp.src/M000001.html b/doc/classes/Regexp.src/M000001.html deleted file mode 100644 index 5508495..0000000 --- a/doc/classes/Regexp.src/M000001.html +++ /dev/null @@ -1,15 +0,0 @@ - - - - negated (Regexp) - - - - -
# File lib/rir/regexp.rb, line 22
-  def negated
-    /^((?!#{self}).)*$/
-  end
- - diff --git a/doc/classes/Regexp.src/M000003.html b/doc/classes/Regexp.src/M000003.html index d55f814..309a1b7 100644 --- a/doc/classes/Regexp.src/M000003.html +++ b/doc/classes/Regexp.src/M000003.html @@ -7,7 +7,7 @@ -
# File lib/rir/regexp.rb, line 24
+  
# File lib/mirimiri/regexp.rb, line 24
   def negated
     /^((?!#{self}).)*$/
   end
diff --git a/doc/classes/String.html b/doc/classes/String.html index 1df0e56..200eba7 100644 --- a/doc/classes/String.html +++ b/doc/classes/String.html @@ -53,9 +53,9 @@ - + - lib/rir/string.rb + lib/mirimiri/string.rb @@ -133,7 +133,7 @@ useful function. diff --git a/doc/classes/String.src/M000002.html b/doc/classes/String.src/M000002.html deleted file mode 100644 index 603a7ac..0000000 --- a/doc/classes/String.src/M000002.html +++ /dev/null @@ -1,15 +0,0 @@ - - - - is_stopword? (String) - - - - -
# File lib/rir/string.rb, line 76
-  def is_stopword?
-    Stoplist.include?(self.downcase)
-  end
- - diff --git a/doc/classes/String.src/M000003.html b/doc/classes/String.src/M000003.html deleted file mode 100644 index c21c139..0000000 --- a/doc/classes/String.src/M000003.html +++ /dev/null @@ -1,15 +0,0 @@ - - - - remove_special_characters (String) - - - - -
# File lib/rir/string.rb, line 82
-  def remove_special_characters
-    self.split.collect { |w| w.gsub(/\W/,' ').split.collect { |w| w.gsub(/\W/,' ').strip.sub(/\A.\z/, '')}.join(' ').strip.sub(/\A.\z/, '')}.join(' ')
-  end
- - diff --git a/doc/classes/String.src/M000004.html b/doc/classes/String.src/M000004.html index 70c504f..825b9c3 100644 --- a/doc/classes/String.src/M000004.html +++ b/doc/classes/String.src/M000004.html @@ -7,7 +7,7 @@ -
# File lib/rir/string.rb, line 78
+  
# File lib/mirimiri/string.rb, line 78
   def is_stopword?
     Stoplist.include?(self.downcase)
   end
diff --git a/doc/classes/String.src/M000005.html b/doc/classes/String.src/M000005.html index 9073156..e5e0c82 100644 --- a/doc/classes/String.src/M000005.html +++ b/doc/classes/String.src/M000005.html @@ -7,7 +7,7 @@ -
# File lib/rir/string.rb, line 84
+  
# File lib/mirimiri/string.rb, line 84
   def remove_special_characters
     self.split.collect { |w| w.gsub(/\W/,' ').split.collect { |w| w.gsub(/\W/,' ').strip.sub(/\A.\z/, '')}.join(' ').strip.sub(/\A.\z/, '')}.join(' ')
   end
diff --git a/doc/classes/String.src/M000006.html b/doc/classes/String.src/M000006.html index 0d97963..43d4bb8 100644 --- a/doc/classes/String.src/M000006.html +++ b/doc/classes/String.src/M000006.html @@ -7,7 +7,7 @@ -
# File lib/rir/string.rb, line 93
+  
# File lib/mirimiri/string.rb, line 93
   def strip_xml_tags!
     replace strip_with_pattern /<\/?[^>]*>/
   end
diff --git a/doc/classes/String.src/M000007.html b/doc/classes/String.src/M000007.html index 00efa8f..5dc93b5 100644 --- a/doc/classes/String.src/M000007.html +++ b/doc/classes/String.src/M000007.html @@ -7,7 +7,7 @@ -
# File lib/rir/string.rb, line 102
+  
# File lib/mirimiri/string.rb, line 102
   def strip_xml_tags
     dup.strip_xml_tags!
   end
diff --git a/doc/classes/String.src/M000008.html b/doc/classes/String.src/M000008.html index 93970bf..86a8def 100644 --- a/doc/classes/String.src/M000008.html +++ b/doc/classes/String.src/M000008.html @@ -7,7 +7,7 @@ -
# File lib/rir/string.rb, line 116
+  
# File lib/mirimiri/string.rb, line 116
   def strip_javascripts!
     replace strip_with_pattern /<script type="text\/javascript">(.+?)<\/script>/m 
   end
diff --git a/doc/classes/String.src/M000009.html b/doc/classes/String.src/M000009.html index b143c5a..d5ff9ad 100644 --- a/doc/classes/String.src/M000009.html +++ b/doc/classes/String.src/M000009.html @@ -7,7 +7,7 @@ -
# File lib/rir/string.rb, line 129
+  
# File lib/mirimiri/string.rb, line 129
   def strip_javascripts
     dup.strip_javascripts!
   end
diff --git a/doc/classes/String.src/M000010.html b/doc/classes/String.src/M000010.html index f26b6c6..278b649 100644 --- a/doc/classes/String.src/M000010.html +++ b/doc/classes/String.src/M000010.html @@ -7,7 +7,7 @@ -
# File lib/rir/string.rb, line 133
+  
# File lib/mirimiri/string.rb, line 133
   def strip_stylesheets!
   # TODO: rewamp. dunno what is it.
     replace strip_with_pattern /<style type="text\/css">(.+?)<\/style>/m 
diff --git a/doc/classes/String.src/M000011.html b/doc/classes/String.src/M000011.html
index 00ac846..b4b63a2 100644
--- a/doc/classes/String.src/M000011.html
+++ b/doc/classes/String.src/M000011.html
@@ -7,7 +7,7 @@
   
 
 
-  
# File lib/rir/string.rb, line 138
+  
# File lib/mirimiri/string.rb, line 138
   def strip_stylesheets
     dup.strip_stylesheets!
   end
diff --git a/doc/classes/String.src/M000012.html b/doc/classes/String.src/M000012.html index a6e3495..33fc1bc 100644 --- a/doc/classes/String.src/M000012.html +++ b/doc/classes/String.src/M000012.html @@ -7,7 +7,7 @@ -
# File lib/rir/string.rb, line 147
+  
# File lib/mirimiri/string.rb, line 147
   def strip_punctuation!
     replace strip_with_pattern /[^a-zA-Z0-9\-\s]/
   end
diff --git a/doc/classes/String.src/M000013.html b/doc/classes/String.src/M000013.html index aa192ed..2dcb75e 100644 --- a/doc/classes/String.src/M000013.html +++ b/doc/classes/String.src/M000013.html @@ -7,7 +7,7 @@ -
# File lib/rir/string.rb, line 155
+  
# File lib/mirimiri/string.rb, line 155
   def strip_punctuation
     dup.strip_punctuation!
   end
diff --git a/doc/classes/String.src/M000014.html b/doc/classes/String.src/M000014.html index d6e6648..fdc6442 100644 --- a/doc/classes/String.src/M000014.html +++ b/doc/classes/String.src/M000014.html @@ -7,7 +7,7 @@ -
# File lib/rir/string.rb, line 163
+  
# File lib/mirimiri/string.rb, line 163
   def extract_xmltags_values(tag_name)
     self.scan(/<#{tag_name}.*?>(.+?)<\/#{tag_name}>/).flatten
   end
diff --git a/doc/classes/TreeTagger.html b/doc/classes/TreeTagger.html index 7ca1358..0c18ef6 100644 --- a/doc/classes/TreeTagger.html +++ b/doc/classes/TreeTagger.html @@ -53,9 +53,9 @@ - + - lib/rir/ttagger.rb + lib/mirimiri/ttagger.rb diff --git a/doc/classes/TreeTagger/Chunk.html b/doc/classes/TreeTagger/Chunk.html index 4aec6b0..a8b24a8 100644 --- a/doc/classes/TreeTagger/Chunk.html +++ b/doc/classes/TreeTagger/Chunk.html @@ -53,9 +53,9 @@ - + - lib/rir/ttagger.rb + lib/mirimiri/ttagger.rb diff --git a/doc/classes/TreeTagger/Chunk.src/M000017.html b/doc/classes/TreeTagger/Chunk.src/M000017.html index 739251a..a6ad0c9 100644 --- a/doc/classes/TreeTagger/Chunk.src/M000017.html +++ b/doc/classes/TreeTagger/Chunk.src/M000017.html @@ -7,7 +7,7 @@ -
# File lib/rir/ttagger.rb, line 89
+  
# File lib/mirimiri/ttagger.rb, line 89
     def initialize str,tag
       @words = str.split
       @tag   = tag[1..-2]
diff --git a/doc/classes/TreeTagger/TaggerChunker.html b/doc/classes/TreeTagger/TaggerChunker.html
index 2e7f693..822b75c 100644
--- a/doc/classes/TreeTagger/TaggerChunker.html
+++ b/doc/classes/TreeTagger/TaggerChunker.html
@@ -53,9 +53,9 @@
             
 
 
-                
+                
 
-                lib/rir/ttagger.rb
+                lib/mirimiri/ttagger.rb
 
                 
 
diff --git a/doc/classes/TreeTagger/TaggerChunker.src/M000015.html b/doc/classes/TreeTagger/TaggerChunker.src/M000015.html
index a3a3546..661f5e4 100644
--- a/doc/classes/TreeTagger/TaggerChunker.src/M000015.html
+++ b/doc/classes/TreeTagger/TaggerChunker.src/M000015.html
@@ -7,7 +7,7 @@
   
 
 
-  
# File lib/rir/ttagger.rb, line 34
+  
# File lib/mirimiri/ttagger.rb, line 34
     def self.parse chunk_lines
       open = false
       tag  = nil
diff --git a/doc/classes/TreeTagger/TaggerChunker.src/M000016.html b/doc/classes/TreeTagger/TaggerChunker.src/M000016.html
index 6652c5c..196758d 100644
--- a/doc/classes/TreeTagger/TaggerChunker.src/M000016.html
+++ b/doc/classes/TreeTagger/TaggerChunker.src/M000016.html
@@ -7,7 +7,7 @@
   
 
 
-  
# File lib/rir/ttagger.rb, line 66
+  
# File lib/mirimiri/ttagger.rb, line 66
     def initialize chunk_file
       @chunks = TaggerChunker.parse File.open(chunk_file).readlines
     end
diff --git a/doc/classes/TreeTagger/TaggerChunkerEnglish.html b/doc/classes/TreeTagger/TaggerChunkerEnglish.html index 45936ae..9159e78 100644 --- a/doc/classes/TreeTagger/TaggerChunkerEnglish.html +++ b/doc/classes/TreeTagger/TaggerChunkerEnglish.html @@ -53,9 +53,9 @@ - + - lib/rir/ttagger.rb + lib/mirimiri/ttagger.rb diff --git a/doc/classes/TreeTagger/TaggerChunkerFrench.html b/doc/classes/TreeTagger/TaggerChunkerFrench.html index 8309193..6de7908 100644 --- a/doc/classes/TreeTagger/TaggerChunkerFrench.html +++ b/doc/classes/TreeTagger/TaggerChunkerFrench.html @@ -53,9 +53,9 @@ - + - lib/rir/ttagger.rb + lib/mirimiri/ttagger.rb diff --git a/doc/classes/TreeTagger/TaggerChunkerGerman.html b/doc/classes/TreeTagger/TaggerChunkerGerman.html index e9a998a..ad26322 100644 --- a/doc/classes/TreeTagger/TaggerChunkerGerman.html +++ b/doc/classes/TreeTagger/TaggerChunkerGerman.html @@ -53,9 +53,9 @@ - + - lib/rir/ttagger.rb + lib/mirimiri/ttagger.rb diff --git a/doc/created.rid b/doc/created.rid index 896d22c..95136cf 100644 --- a/doc/created.rid +++ b/doc/created.rid @@ -1 +1 @@ -Thu, 25 Nov 2010 17:21:51 +0100 +Mon, 20 Dec 2010 10:38:08 +0100 diff --git a/doc/files/lib/rir/corpus_rb.html b/doc/files/lib/rir/corpus_rb.html deleted file mode 100644 index a211e38..0000000 --- a/doc/files/lib/rir/corpus_rb.html +++ /dev/null @@ -1,90 +0,0 @@ - - - - File: corpus.rb [RDoc Documentation] - - - - - - - - - -
-

corpus.rb

- - - - - - - - - -
Path:lib/rir/corpus.rb - -
Last Update:2010-11-25 17:20:52 +0100
-
- - -
- -
- -
- - -
- - - -
- - - - - - - - - -
- - - - - diff --git a/doc/files/lib/rir/document_rb.html b/doc/files/lib/rir/document_rb.html deleted file mode 100644 index 3dd8c6c..0000000 --- a/doc/files/lib/rir/document_rb.html +++ /dev/null @@ -1,106 +0,0 @@ - - - - File: document.rb [RDoc Documentation] - - - - - - - - - -
-

document.rb

- - - - - - - - - -
Path:lib/rir/document.rb - -
Last Update:2010-11-25 17:20:25 +0100
-
- - -
- -
- -
-

Required files

- -
- - net/http   - - rexml/document   - - net/http   - - kconv   - -
-
- -
- - -
- - - -
- - - - - - - - - -
- - - - - diff --git a/doc/files/lib/rir/query_rb.html b/doc/files/lib/rir/query_rb.html deleted file mode 100644 index 7c34092..0000000 --- a/doc/files/lib/rir/query_rb.html +++ /dev/null @@ -1,90 +0,0 @@ - - - - File: query.rb [RDoc Documentation] - - - - - - - - - -
-

query.rb

- - - - - - - - - -
Path:lib/rir/query.rb - -
Last Update:2010-11-25 17:21:14 +0100
-
- - -
- -
- -
- - -
- - - -
- - - - - - - - - -
- - - - - diff --git a/doc/files/lib/rir/regexp_rb.html b/doc/files/lib/rir/regexp_rb.html deleted file mode 100644 index a8a4bf1..0000000 --- a/doc/files/lib/rir/regexp_rb.html +++ /dev/null @@ -1,90 +0,0 @@ - - - - File: regexp.rb [RDoc Documentation] - - - - - - - - - -
-

regexp.rb

- - - - - - - - - -
Path:lib/rir/regexp.rb - -
Last Update:2010-11-25 17:19:39 +0100
-
- - -
- -
- -
- - -
- - - -
- - - - - - - - - -
- - - - - diff --git a/doc/files/lib/rir/string_rb.html b/doc/files/lib/rir/string_rb.html deleted file mode 100644 index 1d78744..0000000 --- a/doc/files/lib/rir/string_rb.html +++ /dev/null @@ -1,109 +0,0 @@ - - - - File: string.rb [RDoc Documentation] - - - - - - - - - -
-

string.rb

- - - - - - - - - -
Path:lib/rir/string.rb - -
Last Update:2010-11-25 17:20:14 +0100
-
- - -
- -
- -
-

-General module for many purposes related to Information Retrieval. -

- -
- -
-

Required files

- -
- - cgi   - - kconv   - -
-
- -
- - -
- - - -
- - - - - - - - - -
- - - - - diff --git a/doc/files/lib/rir/ttagger_rb.html b/doc/files/lib/rir/ttagger_rb.html deleted file mode 100644 index 8ba5cc5..0000000 --- a/doc/files/lib/rir/ttagger_rb.html +++ /dev/null @@ -1,90 +0,0 @@ - - - - File: ttagger.rb [RDoc Documentation] - - - - - - - - - -
-

ttagger.rb

- - - - - - - - - -
Path:lib/rir/ttagger.rb - -
Last Update:2010-11-25 17:21:44 +0100
-
- - -
- -
- -
- - -
- - - -
- - - - - - - - - -
- - - - - diff --git a/doc/files/lib/rir_rb.html b/doc/files/lib/rir_rb.html deleted file mode 100644 index 6486ffa..0000000 --- a/doc/files/lib/rir_rb.html +++ /dev/null @@ -1,110 +0,0 @@ - - - - File: rir.rb [RDoc Documentation] - - - - - - - - - -
-

rir.rb

- - - - - - - - - -
Path:lib/rir.rb - -
Last Update:2010-11-25 15:44:52 +0100
-
- - -
- -
- -
-

Required files

- -
- - rir/document   - - rir/string   - - rir/query   - - rir/corpus   - - rir/regexp   - - rir/ttagger   - -
-
- -
- - -
- - - -
- - - - - - - - - -
- - - - - diff --git a/doc/fr_class_index.html b/doc/fr_class_index.html index 43770f4..e605266 100644 --- a/doc/fr_class_index.html +++ b/doc/fr_class_index.html @@ -25,15 +25,15 @@ Indri::Parameters
- Query
+ Mirimiri
- RIR
+ Mirimiri::Document
- RIR::Document
+ Mirimiri::WebDocument
- RIR::WebDocument
+ Mirimiri::WikipediaPage
- RIR::WikipediaPage
+ Query
Regexp
diff --git a/doc/fr_file_index.html b/doc/fr_file_index.html index 96d6676..38d01a5 100644 --- a/doc/fr_file_index.html +++ b/doc/fr_file_index.html @@ -17,17 +17,19 @@

Files

diff --git a/doc/fr_method_index.html b/doc/fr_method_index.html index d9ad3c7..b53d149 100644 --- a/doc/fr_method_index.html +++ b/doc/fr_method_index.html @@ -17,27 +17,27 @@

Methods

- count_words (RIR::Document)
+ count_words (Mirimiri::Document)
- entropy (RIR::Document)
+ entropy (Mirimiri::Document)
extract_xmltags_values (String)
files (Corpus)
- format_words (RIR::Document)
+ format_words (Mirimiri::Document)
- get_content (RIR::WebDocument)
+ get_content (Mirimiri::WebDocument)
- get_url (RIR::WikipediaPage)
+ get_url (Mirimiri::WikipediaPage)
is_stopword? (String)
negated (Regexp)
- new (RIR::Document)
+ new (Mirimiri::Document)
- new (RIR::WebDocument)
+ new (Mirimiri::WebDocument)
new (Indri::IndriQuery)
@@ -49,15 +49,15 @@ new (TreeTagger::Chunk)
- ngrams (RIR::Document)
+ ngrams (Mirimiri::Document)
parse (TreeTagger::TaggerChunker)
remove_special_characters (String)
- search_homepage (RIR::WikipediaPage)
+ search_homepage (Mirimiri::WikipediaPage)
- search_wikipedia_titles (RIR::WikipediaPage)
+ search_wikipedia_titles (Mirimiri::WikipediaPage)
strip_javascripts (String)
@@ -75,7 +75,7 @@ strip_xml_tags! (String)
- tf (RIR::Document)
+ tf (Mirimiri::Document)
to_s (Indri::Parameters)
diff --git a/doc/index.html b/doc/index.html index cb3feae..31c34b3 100644 --- a/doc/index.html +++ b/doc/index.html @@ -16,6 +16,6 @@ - + diff --git a/lib/rir.rb b/lib/rir.rb deleted file mode 100644 index e21e097..0000000 --- a/lib/rir.rb +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/env ruby - -require 'rir/document' -require 'rir/string' -require 'rir/query' -require 'rir/corpus' -require 'rir/regexp' -require 'rir/ttagger' diff --git a/lib/rir/corpus.rb b/lib/rir/corpus.rb deleted file mode 100644 index 21555c0..0000000 --- a/lib/rir/corpus.rb +++ /dev/null @@ -1,38 +0,0 @@ -#!/usr/bin/env ruby - -#-- -# This file is a part of an Information Retrieval oriented Ruby library -# -# Copyright (C) 2010-2011 Romain Deveaud -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -#++ - -class Corpus - attr_accessor :path - - def initialize(path) - @path = path.chomp "/" - end - - # Recursively outputs all files in +self.path+. - # WARNING ! This function may take a lot of time if many - # files are in subdirectories. - # - # c = Corpus.new "my/path" - # c.files # => ["README.txt", "lib/code.rb"] - def files - Dir["#{@path}/**/*.*"] - end -end diff --git a/lib/rir/document.rb b/lib/rir/document.rb deleted file mode 100644 index 2ed0a59..0000000 --- a/lib/rir/document.rb +++ /dev/null @@ -1,163 +0,0 @@ -#!/usr/bin/env ruby - -#-- -# This file is a part of an Information Retrieval oriented Ruby library -# -# Copyright (C) 2010-2011 Romain Deveaud -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -#++ - - -# General module for many purposes related to Information Retrieval. -module RIR - - # A Document is a bag of words and is constructed from a string. - class Document - attr_reader :words, :doc_content - - # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html - # and the \\W special escape). - # - # Protected function, only meant to by called at the initialization. - def format_words - wo = [] - - @doc_content.split.each do |w| - w.split(/\W/).each do |sw| - wo.push(sw.downcase) if sw =~ /[a-zA-Z]/ - end - end - - wo - end - - # Returns an Array containing the +n+-grams (words) from the current Document. - # - # ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...] - def ngrams(n) - window = [] - ngrams_array = [] - - @words.each do |w| - window.push(w) - if window.size == n - ngrams_array.push window.join(" ") - window.delete_at(0) - end - end - - ngrams_array.uniq - end - - # Returns a Hash containing the words and their associated counts in the current Document. - # - # count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... } - def count_words - counts = Hash.new { |h,k| h[k] = 0 } - @words.each { |w| counts[w] += 1 } - - counts - end - - # Computes the entropy of a given string +s+ inside the document. - # - # If the string parameter is composed of many words (i.e. tokens separated - # by whitespace(s)), it is considered as an ngram. - # - # entropy("guitar") #=> 0.00432114812727959 - # entropy("dillinger escape plan") #=> 0.265862076325102 - def entropy(s) - en = 0.0 - counts = self.count_words - - s.split.each do |w| - p_wi = counts[w].to_f/@words.count.to_f - en += p_wi*Math.log2(p_wi) - end - - en *= -1 - en - end - - # Computes the term frequency of a given *word* +s+. - # - # tf("guitar") #=> 0.000380372765310004 - def tf(s) - self.count_words[s].to_f/@words.size.to_f - end - - - def initialize(content) - @doc_content = content - @words = format_words - end - - protected :format_words - end - - # A WebDocument is a Document with a +url+. - class WebDocument < Document - attr_reader :url - - # Returns the HTML text from the page of a given +url+. - def self.get_content(url) - require 'net/http' - Net::HTTP.get(URI.parse(url)) - end - - # WebDocument constructor, the content of the Document is the HTML page - # without the tags. - def initialize(url) - @url = url - super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags - end - end - - # A WikipediaPage is a WebDocument. - class WikipediaPage < WebDocument - require 'rexml/document' - require 'net/http' - require 'kconv' - - - def self.search_wikipedia_titles(name) - raise ArgumentError, "Bad encoding", name unless name.isutf8 - - res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).toutf8).elements['api/query/search'] - - res.collect { |e| e.attributes['title'] } unless res.nil? - end - - def self.get_url(name) - raise ArgumentError, "Bad encoding", name unless name.isutf8 - - atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).toutf8).elements['api/query/pages/page'].attributes - - atts['fullurl'] if atts['missing'].nil? - end - - def self.search_homepage(name) - title = WikipediaPage.search_wikipedia_titles name - - WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty? - end - -# def initialize(name) -# title = WikipediaPage.search_wikipedia_titles name -# raise ArgumentError, "No page found" if title.empty? -# super WikipediaPage.get_url title[0] -# end - end -end diff --git a/lib/rir/query.rb b/lib/rir/query.rb deleted file mode 100644 index 310d808..0000000 --- a/lib/rir/query.rb +++ /dev/null @@ -1,85 +0,0 @@ -#!/usr/bin/env ruby - -#-- -# This file is a part of an Information Retrieval oriented Ruby library -# -# Copyright (C) 2010-2011 Romain Deveaud -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -#++ - -class Query -end - -module Indri - - class Parameters - attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline - - def initialize(corpus,mem="1g",count="1000",offset="1",run_id="default",print_query=false,print_docs=false) - @index_path = corpus - @memory = mem - @count = count - @offset = offset - @run_id = run_id - @print_query = print_query ? "true" : "false" - @print_docs = print_docs ? "true" : "false" - end - - def to_s - h = "\n" - h += "#{@memory}\n" - h += "#{@index_path}\n" - h += "#{@count}\n" - unless @baseline.nil? - h += "#{@baseline}\n" - else - h += "#{@rule}\n" - end - h += "#{@offset}\n" - h += "#{@run_id}\n" - h += "#{@print_query}\n" - h += "#{@print_docs}\n" - - h - end - end - - class IndriQuery < Query - attr_accessor :id, :query, :params, :rule - - def initialize(id,query,params) - @params = params - # Here we set the default retrieval model as Language Modeling - # with a Dirichlet smoothing at 2500. - # TODO: maybe a Rule class... - @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil? - - @id = id - @query = query - end - - def to_s - h = @params.to_s - h += "\n" - h += "#{@id}\n" - h += "#{@query}\n" - h += "\n" - h += "" - - h - end - end - -end diff --git a/lib/rir/regexp.rb b/lib/rir/regexp.rb deleted file mode 100644 index e2284b7..0000000 --- a/lib/rir/regexp.rb +++ /dev/null @@ -1,28 +0,0 @@ -#!/usr/bin/env ruby - -#-- -# This file is a part of an Information Retrieval oriented Ruby library -# -# Copyright (C) 2010-2011 Romain Deveaud -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -#++ - -class Regexp - - def negated - /^((?!#{self}).)*$/ - end - -end diff --git a/lib/rir/string.rb b/lib/rir/string.rb deleted file mode 100644 index efc06a3..0000000 --- a/lib/rir/string.rb +++ /dev/null @@ -1,174 +0,0 @@ -#!/usr/bin/env ruby - -#-- -# This file is a part of an Information Retrieval oriented Ruby library -# -# Copyright (C) 2010-2011 Romain Deveaud -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see . -#++ - -module RIR - - # These are the default stopwords provided by Lemur. - Stoplist = [ - "a", "anything", "anyway", "anywhere", "apart", "are", "around", "as", "at", "av", - "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", - "behind", "being", "below", "beside", "besides", "between", "beyond", "both", "but", "by", - "can", "cannot", "canst", "certain", "cf", "choose", "contrariwise", "cos", "could", "cu", - "day", "do", "does", "doesn't", "doing", "dost", "doth", "double", "down", "dual", "during", - "each", "either", "else", "elsewhere", "enough", "et", "etc", "even", "ever", "every", - "everybody", "everyone", "everything", "everywhere", "except", "excepted", "excepting", - "exception", "exclude", "excluding", "exclusive", "far", "farther", "farthest", "few", "ff", - "first", "for", "formerly", "forth", "forward", "from", "front", "further", "furthermore", - "furthest", "get", "go", "had", "halves", "hardly", "has", "hast", "hath", "have", "he", - "hence", "henceforth", "her", "here", "hereabouts", "hereafter", "hereby", "herein", "hereto", - "hereupon", "hers", "herself", "him", "himself", "hindmost", "his", "hither", "hitherto", - "how", "however", "howsoever", "i", "ie", "if", "in", "inasmuch", "inc", "include", - "included", "including", "indeed", "indoors", "inside", "insomuch", "instead", "into", - "inward", "inwards", "is", "it", "its", "itself", "just", "kind", "kg", "km", "last", - "latter", "latterly", "less", "lest", "let", "like", "little", "ltd", "many", "may", "maybe", - "me", "meantime", "meanwhile", "might", "moreover", "most", "mostly", "more", "mr", "mrs", - "ms", "much", "must", "my", "myself", "namely", "need", "neither", "never", "nevertheless", - "next", "no", "nobody", "none", "nonetheless", "noone", "nope", "nor", "not", "nothing", - "notwithstanding", "now", "nowadays", "nowhere", "of", "off", "often", "ok", "on", "once", - "one", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", - "ourselves", "out", "outside", "over", "own", "per", "perhaps", "plenty", "provide", "quite", - "rather", "really", "round", "said", "sake", "same", "sang", "save", "saw", "see", "seeing", - "seem", "seemed", "seeming", "seems", "seen", "seldom", "selves", "sent", "several", "shalt", - "she", "should", "shown", "sideways", "since", "slept", "slew", "slung", "slunk", "smote", - "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", - "somewhat", "somewhere", "spake", "spat", "spoke", "spoken", "sprang", "sprung", "stave", - "staves", "still", "such", "supposing", "than", "that", "the", "thee", "their", "them", - "themselves", "then", "thence", "thenceforth", "there", "thereabout", "thereabouts", - "thereafter", "thereby", "therefore", "therein", "thereof", "thereon", "thereto", "thereupon", - "these", "they", "this", "those", "thou", "though", "thrice", "through", "throughout", "thru", - "thus", "thy", "thyself", "till", "to", "together", "too", "toward", "towards", "ugh", - "unable", "under", "underneath", "unless", "unlike", "until", "up", "upon", "upward", - "upwards", "us", "use", "used", "using", "very", "via", "vs", "want", "was", "we", "week", - "well", "were", "what", "whatever", "whatsoever", "when", "whence", "whenever", "whensoever", - "where", "whereabouts", "whereafter", "whereas", "whereat", "whereby", "wherefore", - "wherefrom", "wherein", "whereinto", "whereof", "whereon", "wheresoever", "whereto", - "whereunto", "whereupon", "wherever", "wherewith", "whether", "whew", "which", "whichever", - "whichsoever", "while", "whilst", "whither", "who", "whoa", "whoever", "whole", "whom", - "whomever", "whomsoever", "whose", "whosoever", "why", "will", "wilt", "with", "within", - "without", "worse", "worst", "would", "wow", "ye", "yet", "year", "yippee", "you", "your", - "yours", "yourself", "yourselves" - ] - - -end - -# Extention of the standard class String with useful function. -class String - include RIR - - # Returns +true+ if +self+ belongs to Rir::Stoplist, +false+ otherwise. - def is_stopword? - Stoplist.include?(self.downcase) - end - - # Do not use. - # TODO: rewamp. find why this function is here. - def remove_special_characters - self.split.collect { |w| w.gsub(/\W/,' ').split.collect { |w| w.gsub(/\W/,' ').strip.sub(/\A.\z/, '')}.join(' ').strip.sub(/\A.\z/, '')}.join(' ') - end - - # Removes all XML-like tags from +self+. - # - # s = "test" - # s.strip_xml_tags! - # s #=> "test" - def strip_xml_tags! - replace strip_with_pattern /<\/?[^>]*>/ - end - - # Removes all XML-like tags from +self+. - # - # s = "test" - # s.strip_xml_tags #=> "test" - # s #=> "test" - def strip_xml_tags - dup.strip_xml_tags! - end - - # Removes all Javascript sources from +self+. - # - # s = " - # - # test" - # s.strip_javascripts! - # s #=> "test" - def strip_javascripts! - replace strip_with_pattern / - # - # test" - # s.strip_javascripts #=> "test" - def strip_javascripts - dup.strip_javascripts! - end - - def strip_stylesheets! - # TODO: rewamp. dunno what is it. - replace strip_with_pattern /