diff --git a/doc/classes/RIR.html b/doc/classes/RIR.html index e909f57..77d50d4 100644 --- a/doc/classes/RIR.html +++ b/doc/classes/RIR.html @@ -53,9 +53,9 @@ - + - lib/rir/corpus.rb + lib/rir/ttagger.rb @@ -73,9 +73,9 @@
- + - lib/rir/string.rb + lib/rir/document.rb @@ -142,27 +142,7 @@ with this program. If not, see <www.gnu.org/licenses/>.


-This file is a part of an Information Retrieval oriented Ruby library -

-

-Copyright (C) 2010-2011 Romain Deveaud -

-

-This program is free software: you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the Free -Software Foundation, either version 3 of the License, or (at your option) -any later version. -

-

-This program is distributed in the hope that it will be useful, but WITHOUT -ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or -FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -more details. -

-

-You should have received a copy of the GNU General Public License along -with this program. If not, see <www.gnu.org/licenses/>. +General module for many purposes related to Information Retrieval.

@@ -180,34 +160,14 @@ href="http://www.gnu.org/licenses/">www.gnu.org/licenses/>.

Classes and Modules

Module RIR::Indri
-Class RIR::Corpus
+Module RIR::TreeTagger
+Class RIR::Document
Class RIR::Query
+Class RIR::WebDocument
+Class RIR::WikipediaPage
-
-

Constants

- -
- - - - - - - - - - - - -
Stoplist=[ "a", "anything", "anyway", "anywhere", "apart", "are", "around", "as", "at", "av", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "both", "but", "by", "can", "cannot", "canst", "certain", "cf", "choose", "contrariwise", "cos", "could", "cu", "day", "do", "does", "doesn't", "doing", "dost", "doth", "double", "down", "dual", "during", "each", "either", "else", "elsewhere", "enough", "et", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "except", "excepted", "excepting", "exception", "exclude", "excluding", "exclusive", "far", "farther", "farthest", "few", "ff", "first", "for", "formerly", "forth", "forward", "from", "front", "further", "furthermore", "furthest", "get", "go", "had", "halves", "hardly", "has", "hast", "hath", "have", "he", "hence", "henceforth", "her", "here", "hereabouts", "hereafter", "hereby", "herein", "hereto", "hereupon", "hers", "herself", "him", "himself", "hindmost", "his", "hither", "hitherto", "how", "however", "howsoever", "i", "ie", "if", "in", "inasmuch", "inc", "include", "included", "including", "indeed", "indoors", "inside", "insomuch", "instead", "into", "inward", "inwards", "is", "it", "its", "itself", "just", "kind", "kg", "km", "last", "latter", "latterly", "less", "lest", "let", "like", "little", "ltd", "many", "may", "maybe", "me", "meantime", "meanwhile", "might", "moreover", "most", "mostly", "more", "mr", "mrs", "ms", "much", "must", "my", "myself", "namely", "need", "neither", "never", "nevertheless", "next", "no", "nobody", "none", "nonetheless", "noone", "nope", "nor", "not", "nothing", "notwithstanding", "now", "nowadays", "nowhere", "of", "off", "often", "ok", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "own", "per", "perhaps", "plenty", "provide", "quite", "rather", "really", "round", "said", "sake", "same", "sang", "save", "saw", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "seldom", "selves", "sent", "several", "shalt", "she", "should", "shown", "sideways", "since", "slept", "slew", "slung", "slunk", "smote", "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", "somewhat", "somewhere", "spake", "spat", "spoke", "spoken", "sprang", "sprung", "stave", "staves", "still", "such", "supposing", "than", "that", "the", "thee", "their", "them", "themselves", "then", "thence", "thenceforth", "there", "thereabout", "thereabouts", "thereafter", "thereby", "therefore", "therein", "thereof", "thereon", "thereto", "thereupon", "these", "they", "this", "those", "thou", "though", "thrice", "through", "throughout", "thru", "thus", "thy", "thyself", "till", "to", "together", "too", "toward", "towards", "ugh", "unable", "under", "underneath", "unless", "unlike", "until", "up", "upon", "upward", "upwards", "us", "use", "used", "using", "very", "via", "vs", "want", "was", "we", "week", "well", "were", "what", "whatever", "whatsoever", "when", "whence", "whenever", "whensoever", "where", "whereabouts", "whereafter", "whereas", "whereat", "whereby", "wherefore", "wherefrom", "wherein", "whereinto", "whereof", "whereon", "wheresoever", "whereto", "whereunto", "whereupon", "wherever", "wherewith", "whether", "whew", "which", "whichever", "whichsoever", "while", "whilst", "whither", "who", "whoa", "whoever", "whole", "whom", "whomever", "whomsoever", "whose", "whosoever", "why", "will", "wilt", "with", "within", "without", "worse", "worst", "would", "wow", "ye", "yet", "year", "yippee", "you", "your", "yours", "yourself", "yourselves" ]  -These are the default stopwords provided by Lemur. - -
-
-
- diff --git a/doc/classes/RIR/Document.html b/doc/classes/RIR/Document.html index 8643cb5..00f66b4 100644 --- a/doc/classes/RIR/Document.html +++ b/doc/classes/RIR/Document.html @@ -99,15 +99,17 @@ from a string.
- count_words   + count_words   - entropy   + entropy   - format_words   + format_words   - new   + new   - ngrams   + ngrams   + + tf  
@@ -154,13 +156,13 @@ from a string.

Public Class methods

-
- +
+
- + new(content) @@ -177,13 +179,13 @@ from a string.

Public Instance methods

-
- +
+ -
- + -
- +
+ +
+ + + + +
+ +

+Computes the term frequency of a given word s. +

+
+  tf("guitar") #=> 0.000380372765310004
+
+ +
+
+ +

Protected Instance methods

-
- +
+
- + format_words() diff --git a/doc/classes/RIR/Document.src/M000008.html b/doc/classes/RIR/Document.src/M000008.html new file mode 100644 index 0000000..72c51f5 --- /dev/null +++ b/doc/classes/RIR/Document.src/M000008.html @@ -0,0 +1,23 @@ + + + + format_words (RIR::Document) + + + + +
# File lib/rir/document.rb, line 31
+    def format_words
+      wo = []
+
+      @doc_content.split.each do |w|
+        w.split(/\W/).each do |sw| 
+          wo.push(sw.downcase) if sw =~ /[a-zA-Z]/ 
+        end
+      end
+      
+      wo
+    end
+ + diff --git a/doc/classes/RIR/Document.src/M000009.html b/doc/classes/RIR/Document.src/M000009.html new file mode 100644 index 0000000..6257629 --- /dev/null +++ b/doc/classes/RIR/Document.src/M000009.html @@ -0,0 +1,26 @@ + + + + ngrams (RIR::Document) + + + + +
# File lib/rir/document.rb, line 46
+    def ngrams(n)
+      window       = []
+      ngrams_array = []
+
+      @words.each do |w|
+        window.push(w)
+        if window.size == n
+          ngrams_array.push window.join(" ")
+          window.delete_at(0)
+        end
+      end
+
+      ngrams_array.uniq
+    end
+ + diff --git a/doc/classes/RIR/Document.src/M000010.html b/doc/classes/RIR/Document.src/M000010.html index 5056008..e8ddeec 100644 --- a/doc/classes/RIR/Document.src/M000010.html +++ b/doc/classes/RIR/Document.src/M000010.html @@ -2,22 +2,17 @@ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> - format_words (RIR::Document) + count_words (RIR::Document) -
# File lib/rir/document.rb, line 31
-    def format_words
-      wo = []
+  
# File lib/rir/document.rb, line 64
+    def count_words
+      counts = Hash.new { |h,k| h[k] = 0 }
+      @words.each { |w| counts[w] += 1 }
 
-      @doc_content.split.each do |w|
-        w.split(/\W/).each do |sw| 
-          wo.push(sw) if sw =~ /[a-zA-Z]/ 
-        end
-      end
-      
-      wo
+      counts
     end
diff --git a/doc/classes/RIR/Document.src/M000011.html b/doc/classes/RIR/Document.src/M000011.html index 6257629..50f6db7 100644 --- a/doc/classes/RIR/Document.src/M000011.html +++ b/doc/classes/RIR/Document.src/M000011.html @@ -2,25 +2,23 @@ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> - ngrams (RIR::Document) + entropy (RIR::Document) -
# File lib/rir/document.rb, line 46
-    def ngrams(n)
-      window       = []
-      ngrams_array = []
+  
# File lib/rir/document.rb, line 78
+    def entropy(s)
+      en = 0.0
+      counts = self.count_words
 
-      @words.each do |w|
-        window.push(w)
-        if window.size == n
-          ngrams_array.push window.join(" ")
-          window.delete_at(0)
-        end
+      s.split.each do |w|
+        p_wi = counts[w].to_f/@words.count.to_f
+        en += p_wi*Math.log2(p_wi)
       end
 
-      ngrams_array.uniq
+      en *= -1
+      en
     end
diff --git a/doc/classes/RIR/Document.src/M000012.html b/doc/classes/RIR/Document.src/M000012.html index 978234a..eb4436b 100644 --- a/doc/classes/RIR/Document.src/M000012.html +++ b/doc/classes/RIR/Document.src/M000012.html @@ -2,17 +2,14 @@ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> - count_words (RIR::Document) + tf (RIR::Document) -
# File lib/rir/document.rb, line 64
-    def count_words
-      counts = Hash.new { |h,k| h[k] = 0 }
-      @words.each { |w| counts[w.downcase] += 1 }
-
-      counts
+  
# File lib/rir/document.rb, line 94
+    def tf(s)
+      self.count_words[s].to_f/@words.size.to_f
     end
diff --git a/doc/classes/RIR/Document.src/M000013.html b/doc/classes/RIR/Document.src/M000013.html index 5694971..1ef96d5 100644 --- a/doc/classes/RIR/Document.src/M000013.html +++ b/doc/classes/RIR/Document.src/M000013.html @@ -2,23 +2,15 @@ "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> - entropy (RIR::Document) + new (RIR::Document) -
# File lib/rir/document.rb, line 77
-    def entropy(s)
-      en = 0.0
-      counts = self.count_words
-
-      s.split.each do |w|
-        p_wi = counts[w].to_f/@words.count.to_f
-        en += p_wi*Math.log2(p_wi)
-      end
-
-      en *= -1
-      en
+  
# File lib/rir/document.rb, line 99
+    def initialize(content)
+      @doc_content = content
+      @words = format_words
     end
diff --git a/doc/classes/RIR/Indri/IndriQuery.html b/doc/classes/RIR/Indri/IndriQuery.html index 5bd769e..922525d 100644 --- a/doc/classes/RIR/Indri/IndriQuery.html +++ b/doc/classes/RIR/Indri/IndriQuery.html @@ -95,9 +95,9 @@
- new   + new   - to_s   + to_s  
@@ -160,13 +160,13 @@

Public Class methods

-
- +
+
- + new(id,query,params) @@ -183,13 +183,13 @@

Public Instance methods

-
- +
+ @@ -121,7 +121,7 @@ - corpus + count  [RW]  @@ -129,7 +129,7 @@ - count + index_path  [RW]  @@ -196,13 +196,13 @@

Public Class methods

-
- +
+
- + new(corpus,mem="1g",count="1000",offset="1",run_id="default",print_query=false,print_docs=false) @@ -219,13 +219,13 @@

Public Instance methods

-
- +
+
- + to_s() diff --git a/doc/classes/RIR/Indri/Parameters.src/M000004.html b/doc/classes/RIR/Indri/Parameters.src/M000004.html new file mode 100644 index 0000000..a5d26e0 --- /dev/null +++ b/doc/classes/RIR/Indri/Parameters.src/M000004.html @@ -0,0 +1,21 @@ + + + + new (RIR::Indri::Parameters) + + + + +
# File lib/rir/query.rb, line 30
+      def initialize(corpus,mem="1g",count="1000",offset="1",run_id="default",print_query=false,print_docs=false)
+        @index_path  = corpus
+        @memory      = mem
+        @count       = count
+        @offset      = offset
+        @run_id      = run_id
+        @print_query = print_query ? "true" : "false"
+        @print_docs  = print_docs  ? "true" : "false"
+      end
+ + diff --git a/doc/classes/RIR/Indri/Parameters.src/M000005.html b/doc/classes/RIR/Indri/Parameters.src/M000005.html new file mode 100644 index 0000000..3a235ca --- /dev/null +++ b/doc/classes/RIR/Indri/Parameters.src/M000005.html @@ -0,0 +1,29 @@ + + + + to_s (RIR::Indri::Parameters) + + + + +
# File lib/rir/query.rb, line 40
+      def to_s
+        h = "<parameters>\n"
+        h += "<memory>#{@memory}</memory>\n"
+        h += "<index>#{@index_path}</index>\n"
+        h += "<count>#{@count}</count>\n"
+        unless @baseline.nil?
+          h += "<baseline>#{@baseline}</baseline>\n" 
+        else
+          h += "<rule>#{@rule}</rule>\n"
+        end
+        h += "<queryOffset>#{@offset}</queryOffset>\n"
+        h += "<runID>#{@run_id}</runID>\n"
+        h += "<printQuery>#{@print_query}</printQuery>\n"
+        h += "<printDocuments>#{@print_docs}</printDocuments>\n"
+
+        h
+      end
+ + diff --git a/doc/classes/RIR/TreeTagger.html b/doc/classes/RIR/TreeTagger.html new file mode 100644 index 0000000..a46f3bd --- /dev/null +++ b/doc/classes/RIR/TreeTagger.html @@ -0,0 +1,123 @@ + + + + Module: RIR::TreeTagger [RDoc Documentation] + + + + + + + + + +
+ + + + + + + + + + + +
ModuleRIR::TreeTagger
In: + + + + + lib/rir/ttagger.rb + + + + +
+ +
+
+ + +
+ + + + + + + + + diff --git a/doc/classes/RIR/TreeTagger/Chunk.html b/doc/classes/RIR/TreeTagger/Chunk.html new file mode 100644 index 0000000..95f4e8c --- /dev/null +++ b/doc/classes/RIR/TreeTagger/Chunk.html @@ -0,0 +1,187 @@ + + + + Class: RIR::TreeTagger::Chunk [RDoc Documentation] + + + + + + + + + +
+ + + + + + + + + + + + + + + + +
ClassRIR::TreeTagger::Chunk
In: + + + + + lib/rir/ttagger.rb + + + + +
+ +
Parent: + + Object + +
+
+ + +
+ +
+ +
+

+Represents a Chunk extracted when parsing a TaggerChunker file. +

+ +
+ +
+ + +
+

Methods

+ +
+ + new   + +
+
+ +
+ + + +
+ + + +
+

Attributes

+ +
+ + + + + + + + + + + + + + + + + + +
tag [R] 
words [R] 
+
+
+ + + + +
+ +

Public Class methods

+ + +
+ + + + +
+ +

+str are whitespace-separated terms. tag see : ftp.ims.uni-stuttgart.de/pub/corpora/chunker-tagset-english.txt +

+ +
+
+ + + +
+ + + + +
+ + + + + diff --git a/doc/classes/RIR/TreeTagger/Chunk.src/M000003.html b/doc/classes/RIR/TreeTagger/Chunk.src/M000003.html new file mode 100644 index 0000000..239dc5a --- /dev/null +++ b/doc/classes/RIR/TreeTagger/Chunk.src/M000003.html @@ -0,0 +1,16 @@ + + + + new (RIR::TreeTagger::Chunk) + + + + +
# File lib/rir/ttagger.rb, line 86
+      def initialize str,tag
+        @words = str.split
+        @tag   = tag[1..-2]
+      end
+ + diff --git a/doc/classes/RIR/TreeTagger/TaggerChunker.html b/doc/classes/RIR/TreeTagger/TaggerChunker.html new file mode 100644 index 0000000..57ae1ae --- /dev/null +++ b/doc/classes/RIR/TreeTagger/TaggerChunker.html @@ -0,0 +1,216 @@ + + + + Class: RIR::TreeTagger::TaggerChunker [RDoc Documentation] + + + + + + + + + +
+ + + + + + + + + + + + + + + + +
ClassRIR::TreeTagger::TaggerChunker
In: + + + + + lib/rir/ttagger.rb + + + + +
+ +
Parent: + + Object + +
+
+ + +
+ +
+ +
+

+This class handles generic parsing of tagger-chunker outputs. +

+ +
+ +
+ + +
+

Methods

+ +
+ + new   + + parse   + +
+
+ +
+ + + +
+ + + +
+

Attributes

+ +
+ + + + + + + + + + + + + + + + + + +
chunks [R] 
file [R] 
+
+
+ + + + +
+ +

Public Class methods

+ + +
+ + + + +
+ +

+Initializes parsing. chunk_file is the output of +tagger-chunker- and must be a valid path to the file. +

+
+  TaggerChunker.new("ttout/2010020") #=> #<RIR::TreeTagger::TaggerChunker:0x92fd088 @chunks=[#<RIR::TreeTagger::Chunk:0x8ec5a10 @words=["robert", "schumann"], @tag="NC">, ...] ...>
+
+ +
+
+ + +
+ + + + +
+ +

+Parses a tagger-chunker output and returns an Array of Chunk. +

+ +
+
+ + + +
+ + + + +
+ + + + + diff --git a/doc/classes/RIR/TreeTagger/TaggerChunker.src/M000001.html b/doc/classes/RIR/TreeTagger/TaggerChunker.src/M000001.html new file mode 100644 index 0000000..3bdb228 --- /dev/null +++ b/doc/classes/RIR/TreeTagger/TaggerChunker.src/M000001.html @@ -0,0 +1,39 @@ + + + + parse (RIR::TreeTagger::TaggerChunker) + + + + +
# File lib/rir/ttagger.rb, line 33
+      def self.parse chunk_lines
+        open = false
+        tag  = nil
+
+        chunks = []
+        words  = []
+
+        chunk_lines.each do |l|
+          l.chomp!
+          if l =~ /^<\w+>$/
+            open = true
+            tag  = l
+          elsif l =~ /^<\/\w+>$/
+            if !words.empty? && open && l == tag.sub(/</, '</')
+              open = false
+              chunks.push Chunk.new(words.join(" "), tag) 
+              words.clear
+            else
+              next
+            end
+          else
+            words.push(l.split.first)
+          end
+        end
+
+        chunks
+      end
+ + diff --git a/doc/classes/RIR/TreeTagger/TaggerChunker.src/M000002.html b/doc/classes/RIR/TreeTagger/TaggerChunker.src/M000002.html new file mode 100644 index 0000000..c33487c --- /dev/null +++ b/doc/classes/RIR/TreeTagger/TaggerChunker.src/M000002.html @@ -0,0 +1,15 @@ + + + + new (RIR::TreeTagger::TaggerChunker) + + + + +
# File lib/rir/ttagger.rb, line 65
+      def initialize chunk_file
+        @chunks = TaggerChunker.parse File.open(chunk_file).readlines
+      end
+ + diff --git a/doc/classes/RIR/TreeTagger/TaggerChunkerEnglish.html b/doc/classes/RIR/TreeTagger/TaggerChunkerEnglish.html new file mode 100644 index 0000000..f5a878f --- /dev/null +++ b/doc/classes/RIR/TreeTagger/TaggerChunkerEnglish.html @@ -0,0 +1,114 @@ + + + + Class: RIR::TreeTagger::TaggerChunkerEnglish [RDoc Documentation] + + + + + + + + + +
+ + + + + + + + + + + + + + + + +
ClassRIR::TreeTagger::TaggerChunkerEnglish
In: + + + + + lib/rir/ttagger.rb + + + + +
+ +
Parent: + + + + RIR::TreeTagger::TaggerChunker + + + +
+
+ + +
+ +
+ +
+ + +
+ + + +
+ + + + + + + + + +
+ + + + + diff --git a/doc/classes/RIR/TreeTagger/TaggerChunkerFrench.html b/doc/classes/RIR/TreeTagger/TaggerChunkerFrench.html new file mode 100644 index 0000000..203b487 --- /dev/null +++ b/doc/classes/RIR/TreeTagger/TaggerChunkerFrench.html @@ -0,0 +1,114 @@ + + + + Class: RIR::TreeTagger::TaggerChunkerFrench [RDoc Documentation] + + + + + + + + + +
+ + + + + + + + + + + + + + + + +
ClassRIR::TreeTagger::TaggerChunkerFrench
In: + + + + + lib/rir/ttagger.rb + + + + +
+ +
Parent: + + + + RIR::TreeTagger::TaggerChunker + + + +
+
+ + +
+ +
+ +
+ + +
+ + + +
+ + + + + + + + + +
+ + + + + diff --git a/doc/classes/RIR/TreeTagger/TaggerChunkerGerman.html b/doc/classes/RIR/TreeTagger/TaggerChunkerGerman.html new file mode 100644 index 0000000..fd7e410 --- /dev/null +++ b/doc/classes/RIR/TreeTagger/TaggerChunkerGerman.html @@ -0,0 +1,114 @@ + + + + Class: RIR::TreeTagger::TaggerChunkerGerman [RDoc Documentation] + + + + + + + + + +
+ + + + + + + + + + + + + + + + +
ClassRIR::TreeTagger::TaggerChunkerGerman
In: + + + + + lib/rir/ttagger.rb + + + + +
+ +
Parent: + + + + RIR::TreeTagger::TaggerChunker + + + +
+
+ + +
+ +
+ +
+ + +
+ + + +
+ + + + + + + + + +
+ + + + + diff --git a/doc/classes/RIR/WebDocument.html b/doc/classes/RIR/WebDocument.html index 75a2a8d..f2e6c6a 100644 --- a/doc/classes/RIR/WebDocument.html +++ b/doc/classes/RIR/WebDocument.html @@ -103,9 +103,9 @@ href="Document.html">Document with a url.
- get_content   + get_content   - new   + new  
@@ -144,13 +144,13 @@ href="Document.html">Document with a url.

Public Class methods

-
- +
+ -
- +
+ @@ -128,13 +128,13 @@ href="WebDocument.html">WebDocument.

Public Class methods

-
- +
+ -
- +
+ -
- +
+
- + search_wikipedia_titles(name) diff --git a/doc/classes/RIR/WikipediaPage.src/M000016.html b/doc/classes/RIR/WikipediaPage.src/M000016.html new file mode 100644 index 0000000..eb3518e --- /dev/null +++ b/doc/classes/RIR/WikipediaPage.src/M000016.html @@ -0,0 +1,19 @@ + + + + search_wikipedia_titles (RIR::WikipediaPage) + + + + +
# File lib/rir/document.rb, line 132
+    def self.search_wikipedia_titles(name)
+      raise ArgumentError, "Bad encoding", name unless name.isutf8
+
+      res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).toutf8).elements['api/query/search']
+
+      res.collect { |e| e.attributes['title'] } unless res.nil?
+    end
+ + diff --git a/doc/classes/RIR/WikipediaPage.src/M000017.html b/doc/classes/RIR/WikipediaPage.src/M000017.html new file mode 100644 index 0000000..0b6b98b --- /dev/null +++ b/doc/classes/RIR/WikipediaPage.src/M000017.html @@ -0,0 +1,19 @@ + + + + get_url (RIR::WikipediaPage) + + + + +
# File lib/rir/document.rb, line 140
+    def self.get_url(name)
+      raise ArgumentError, "Bad encoding", name unless name.isutf8
+
+      atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).toutf8).elements['api/query/pages/page'].attributes
+
+      atts['fullurl'] if atts['missing'].nil?
+    end
+ + diff --git a/doc/classes/RIR/WikipediaPage.src/M000018.html b/doc/classes/RIR/WikipediaPage.src/M000018.html new file mode 100644 index 0000000..d93d8db --- /dev/null +++ b/doc/classes/RIR/WikipediaPage.src/M000018.html @@ -0,0 +1,17 @@ + + + + search_homepage (RIR::WikipediaPage) + + + + +
# File lib/rir/document.rb, line 148
+    def self.search_homepage(name)
+      title = WikipediaPage.search_wikipedia_titles name
+
+      WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty?
+    end
+ + diff --git a/doc/created.rid b/doc/created.rid index 5d2582c..5367b38 100644 --- a/doc/created.rid +++ b/doc/created.rid @@ -1 +1 @@ -Tue, 23 Nov 2010 18:20:46 +0100 +Thu, 25 Nov 2010 17:01:52 +0100 diff --git a/doc/files/lib/rir/document_rb.html b/doc/files/lib/rir/document_rb.html index 767c904..d5a6ac5 100644 --- a/doc/files/lib/rir/document_rb.html +++ b/doc/files/lib/rir/document_rb.html @@ -53,7 +53,7 @@ Last Update: - 2010-11-23 18:14:13 +0100 + 2010-11-25 16:04:20 +0100
diff --git a/doc/files/lib/rir/query_rb.html b/doc/files/lib/rir/query_rb.html index 2868c72..a2db500 100644 --- a/doc/files/lib/rir/query_rb.html +++ b/doc/files/lib/rir/query_rb.html @@ -53,7 +53,7 @@ Last Update: - 2010-11-23 18:20:30 +0100 + 2010-11-25 13:25:18 +0100
diff --git a/doc/files/lib/rir/ttagger_rb.html b/doc/files/lib/rir/ttagger_rb.html new file mode 100644 index 0000000..67b5fa6 --- /dev/null +++ b/doc/files/lib/rir/ttagger_rb.html @@ -0,0 +1,143 @@ + + + + File: ttagger.rb [RDoc Documentation] + + + + + + + + + +
+

ttagger.rb

+ + + + + + + + + +
Path:lib/rir/ttagger.rb + +
Last Update:2010-11-25 17:01:46 +0100
+
+ + +
+ +
+ +
+

+This file is a part of an Information Retrieval oriented Ruby library +

+

+Copyright (C) 2010-2011 Romain Deveaud +

+

+This program is free software: you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the Free +Software Foundation, either version 3 of the License, or (at your option) +any later version. +

+

+This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +more details. +

+

+You should have received a copy of the GNU General Public License along +with this program. If not, see <www.gnu.org/licenses/>. +

+

+This file is a part of an Information Retrieval oriented Ruby library +

+

+Copyright (C) 2010-2011 Romain Deveaud +

+

+This program is free software: you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the Free +Software Foundation, either version 3 of the License, or (at your option) +any later version. +

+

+This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +more details. +

+

+You should have received a copy of the GNU General Public License along +with this program. If not, see <www.gnu.org/licenses/>. +

+

+General module for many purposes related to Information Retrieval. +

+ +
+ +
+ + +
+ + + +
+ + + + + + + + + +
+ + + + + diff --git a/doc/files/lib/rir_rb.html b/doc/files/lib/rir_rb.html index d43a4b6..6486ffa 100644 --- a/doc/files/lib/rir_rb.html +++ b/doc/files/lib/rir_rb.html @@ -53,7 +53,7 @@ Last Update: - 2010-11-19 11:27:16 +0100 + 2010-11-25 15:44:52 +0100
@@ -78,6 +78,8 @@ rir/regexp   + rir/ttagger   +
diff --git a/doc/fr_class_index.html b/doc/fr_class_index.html index 9a24111..36ac9d8 100644 --- a/doc/fr_class_index.html +++ b/doc/fr_class_index.html @@ -19,7 +19,7 @@ RIR
- RIR::Corpus
+ RIR::Document
RIR::Indri
@@ -29,7 +29,21 @@ RIR::Query
- String
+ RIR::TreeTagger
+ + RIR::TreeTagger::Chunk
+ + RIR::TreeTagger::TaggerChunker
+ + RIR::TreeTagger::TaggerChunkerEnglish
+ + RIR::TreeTagger::TaggerChunkerFrench
+ + RIR::TreeTagger::TaggerChunkerGerman
+ + RIR::WebDocument
+ + RIR::WikipediaPage
diff --git a/doc/fr_file_index.html b/doc/fr_file_index.html index 8871047..6d7fbed 100644 --- a/doc/fr_file_index.html +++ b/doc/fr_file_index.html @@ -17,11 +17,13 @@

Files

diff --git a/doc/fr_method_index.html b/doc/fr_method_index.html index c909673..368ae37 100644 --- a/doc/fr_method_index.html +++ b/doc/fr_method_index.html @@ -17,39 +17,41 @@

Methods

diff --git a/doc/index.html b/doc/index.html index dcf5a4f..4b44566 100644 --- a/doc/index.html +++ b/doc/index.html @@ -16,6 +16,6 @@ - + diff --git a/lib/rir.rb b/lib/rir.rb index 0b27852..e21e097 100644 --- a/lib/rir.rb +++ b/lib/rir.rb @@ -5,3 +5,4 @@ require 'rir/string' require 'rir/query' require 'rir/corpus' require 'rir/regexp' +require 'rir/ttagger' diff --git a/lib/rir/document.rb b/lib/rir/document.rb index 5bda4e1..e5f69aa 100644 --- a/lib/rir/document.rb +++ b/lib/rir/document.rb @@ -73,7 +73,8 @@ module RIR # If the string parameter is composed of many words (i.e. tokens separated # by whitespace(s)), it is considered as an ngram. # - # entropy("guitar") #=> 0.00389919463243839 + # entropy("guitar") #=> 0.00432114812727959 + # entropy("dillinger escape plan") #=> 0.265862076325102 def entropy(s) en = 0.0 counts = self.count_words @@ -87,6 +88,12 @@ module RIR en end + # Computes the term frequency of a given *word* +s+. + # + # tf("guitar") #=> 0.000380372765310004 + def tf(s) + self.count_words[s].to_f/@words.size.to_f + end def initialize(content) @@ -123,13 +130,17 @@ module RIR def self.search_wikipedia_titles(name) - res = REXML::Document.new(Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml")).toutf8).elements['api/query/search'] + raise ArgumentError, "Bad encoding", name unless name.isutf8 + + res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).toutf8).elements['api/query/search'] res.collect { |e| e.attributes['title'] } unless res.nil? end def self.get_url(name) - atts = REXML::Document.new(Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml")).toutf8).elements['api/query/pages/page'].attributes + raise ArgumentError, "Bad encoding", name unless name.isutf8 + + atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).toutf8).elements['api/query/pages/page'].attributes atts['fullurl'] if atts['missing'].nil? end @@ -137,11 +148,7 @@ module RIR def self.search_homepage(name) title = WikipediaPage.search_wikipedia_titles name - begin - WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty? - rescue - puts title[0] - end + WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty? end # def initialize(name) diff --git a/lib/rir/query.rb b/lib/rir/query.rb index d18e297..dbff657 100644 --- a/lib/rir/query.rb +++ b/lib/rir/query.rb @@ -25,10 +25,10 @@ module RIR module Indri class Parameters - attr_accessor :corpus, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline + attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline def initialize(corpus,mem="1g",count="1000",offset="1",run_id="default",print_query=false,print_docs=false) - @corpus = corpus + @index_path = corpus @memory = mem @count = count @offset = offset @@ -40,7 +40,7 @@ module RIR def to_s h = "\n" h += "#{@memory}\n" - h += "#{@corpus}\n" + h += "#{@index_path}\n" h += "#{@count}\n" unless @baseline.nil? h += "#{@baseline}\n" diff --git a/lib/rir/ttagger.rb b/lib/rir/ttagger.rb new file mode 100644 index 0000000..e1f2bd6 --- /dev/null +++ b/lib/rir/ttagger.rb @@ -0,0 +1,93 @@ +#!/usr/bin/env ruby + +# This file is a part of an Information Retrieval oriented Ruby library +# +# Copyright (C) 2010-2011 Romain Deveaud +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +module RIR + + # TreeTagger-related stuff module. + # + # See http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/DecisionTreeTagger.html + module TreeTagger + + # This class handles generic parsing of tagger-chunker outputs. + class TaggerChunker + attr_reader :chunks, :file + + + # Parses a tagger-chunker output and returns an Array of Chunk. + def self.parse chunk_lines + open = false + tag = nil + + chunks = [] + words = [] + + chunk_lines.each do |l| + l.chomp! + if l =~ /^<\w+>$/ + open = true + tag = l + elsif l =~ /^<\/\w+>$/ + if !words.empty? && open && l == tag.sub(/ #, ...] ...> + def initialize chunk_file + @chunks = TaggerChunker.parse File.open(chunk_file).readlines + end + + end + + class TaggerChunkerEnglish < TaggerChunker + end + + class TaggerChunkerFrench < TaggerChunker + end + + class TaggerChunkerGerman < TaggerChunker + end + + # Represents a Chunk extracted when parsing a TaggerChunker file. + class Chunk + attr_reader :words, :tag + + # +str+ are whitespace-separated terms. + # +tag+ see : ftp://ftp.ims.uni-stuttgart.de/pub/corpora/chunker-tagset-english.txt + def initialize str,tag + @words = str.split + @tag = tag[1..-2] + end + end + + end +end diff --git a/main.rb b/main.rb index 87408f7..78d5b8d 100644 --- a/main.rb +++ b/main.rb @@ -3,4 +3,5 @@ $LOAD_PATH.unshift File.expand_path(File.join(File.dirname(__FILE__), "lib")) require 'rir' w = RIR::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan") -p w.entropy("guitar") +p w.entropy("dillinger escape plan") +p w.tf("guitar")