From 7043da90bf781276184a770f306cfe7b59c17d5a Mon Sep 17 00:00:00 2001 From: Romain Deveaud Date: Fri, 5 Nov 2010 14:47:53 +0100 Subject: [PATCH] first commit --- README.markdown | 19 ++ doc/classes/Rir.html | 153 ++++++++++ doc/classes/Rir/Document.html | 312 +++++++++++++++++++++ doc/classes/Rir/Document.src/M000010.html | 23 ++ doc/classes/Rir/Document.src/M000011.html | 26 ++ doc/classes/Rir/Document.src/M000012.html | 18 ++ doc/classes/Rir/Document.src/M000013.html | 24 ++ doc/classes/Rir/Document.src/M000014.html | 16 ++ doc/classes/Rir/WebDocument.html | 209 ++++++++++++++ doc/classes/Rir/WebDocument.src/M000015.html | 16 ++ doc/classes/Rir/WebDocument.src/M000016.html | 16 ++ doc/classes/Rir/WikipediaPage.html | 122 ++++++++ doc/classes/String.html | 404 +++++++++++++++++++++++++++ doc/classes/String.src/M000001.html | 15 + doc/classes/String.src/M000002.html | 15 + doc/classes/String.src/M000003.html | 15 + doc/classes/String.src/M000004.html | 15 + doc/classes/String.src/M000005.html | 15 + doc/classes/String.src/M000006.html | 15 + doc/classes/String.src/M000007.html | 16 ++ doc/classes/String.src/M000008.html | 15 + doc/classes/String.src/M000009.html | 15 + doc/created.rid | 1 + doc/files/README_markdown.html | 90 ++++++ doc/files/lib/rir/document_rb.html | 127 +++++++++ doc/files/lib/rir/string_rb.html | 129 +++++++++ doc/files/lib/rir_rb.html | 102 +++++++ doc/files/main_rb.html | 100 +++++++ doc/fr_class_index.html | 33 +++ doc/fr_file_index.html | 33 +++ doc/fr_method_index.html | 55 ++++ doc/index.html | 21 ++ doc/rdoc-style.css | 299 ++++++++++++++++++++ lib/rir.rb | 4 + lib/rir/document.rb | 121 ++++++++ lib/rir/string.rb | 155 ++++++++++ main.rb | 3 + 37 files changed, 2767 insertions(+) create mode 100644 README.markdown create mode 100644 doc/classes/Rir.html create mode 100644 doc/classes/Rir/Document.html create mode 100644 doc/classes/Rir/Document.src/M000010.html create mode 100644 doc/classes/Rir/Document.src/M000011.html create mode 
100644 doc/classes/Rir/Document.src/M000012.html create mode 100644 doc/classes/Rir/Document.src/M000013.html create mode 100644 doc/classes/Rir/Document.src/M000014.html create mode 100644 doc/classes/Rir/WebDocument.html create mode 100644 doc/classes/Rir/WebDocument.src/M000015.html create mode 100644 doc/classes/Rir/WebDocument.src/M000016.html create mode 100644 doc/classes/Rir/WikipediaPage.html create mode 100644 doc/classes/String.html create mode 100644 doc/classes/String.src/M000001.html create mode 100644 doc/classes/String.src/M000002.html create mode 100644 doc/classes/String.src/M000003.html create mode 100644 doc/classes/String.src/M000004.html create mode 100644 doc/classes/String.src/M000005.html create mode 100644 doc/classes/String.src/M000006.html create mode 100644 doc/classes/String.src/M000007.html create mode 100644 doc/classes/String.src/M000008.html create mode 100644 doc/classes/String.src/M000009.html create mode 100644 doc/created.rid create mode 100644 doc/files/README_markdown.html create mode 100644 doc/files/lib/rir/document_rb.html create mode 100644 doc/files/lib/rir/string_rb.html create mode 100644 doc/files/lib/rir_rb.html create mode 100644 doc/files/main_rb.html create mode 100644 doc/fr_class_index.html create mode 100644 doc/fr_file_index.html create mode 100644 doc/fr_method_index.html create mode 100644 doc/index.html create mode 100644 doc/rdoc-style.css create mode 100644 lib/rir.rb create mode 100644 lib/rir/document.rb create mode 100644 lib/rir/string.rb create mode 100644 main.rb diff --git a/README.markdown b/README.markdown new file mode 100644 index 0000000..e61112c --- /dev/null +++ b/README.markdown @@ -0,0 +1,19 @@ +# Ruby Information Retrieval (rIR) + +Copyright (C) 2010-2011 Romain Deveaud + +License +======= + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the 
License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see <http://www.gnu.org/licenses/>. diff --git a/doc/classes/Rir.html b/doc/classes/Rir.html new file mode 100644 index 0000000..f3b2275 --- /dev/null +++ b/doc/classes/Rir.html @@ -0,0 +1,153 @@ + + + + Module: Rir [RDoc Documentation] + + + + + + + + + +
+ + + + + + + + + + + +
ModuleRir
In: + + + + + lib/rir/string.rb + + + + +
+ + + + + lib/rir/document.rb + + + + +
+ +
+
+ + +
+ +
+ +
+

+General module for many purposes related to Information Retrieval. +

+

+General module for many purposes related to Information Retrieval. +

+ +
+ +
+ + +
+ + + +
+ +
+

Classes and Modules

+ + Class Rir::Document
+Class Rir::WebDocument
+Class Rir::WikipediaPage
+ +
+ +
+

Constants

+ +
+ + + + + + + + + + + + +
Stoplist=[ "a", "anything", "anyway", "anywhere", "apart", "are", "around", "as", "at", "av", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "both", "but", "by", "can", "cannot", "canst", "certain", "cf", "choose", "contrariwise", "cos", "could", "cu", "day", "do", "does", "doesn't", "doing", "dost", "doth", "double", "down", "dual", "during", "each", "either", "else", "elsewhere", "enough", "et", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "except", "excepted", "excepting", "exception", "exclude", "excluding", "exclusive", "far", "farther", "farthest", "few", "ff", "first", "for", "formerly", "forth", "forward", "from", "front", "further", "furthermore", "furthest", "get", "go", "had", "halves", "hardly", "has", "hast", "hath", "have", "he", "hence", "henceforth", "her", "here", "hereabouts", "hereafter", "hereby", "herein", "hereto", "hereupon", "hers", "herself", "him", "himself", "hindmost", "his", "hither", "hitherto", "how", "however", "howsoever", "i", "ie", "if", "in", "inasmuch", "inc", "include", "included", "including", "indeed", "indoors", "inside", "insomuch", "instead", "into", "inward", "inwards", "is", "it", "its", "itself", "just", "kind", "kg", "km", "last", "latter", "latterly", "less", "lest", "let", "like", "little", "ltd", "many", "may", "maybe", "me", "meantime", "meanwhile", "might", "moreover", "most", "mostly", "more", "mr", "mrs", "ms", "much", "must", "my", "myself", "namely", "need", "neither", "never", "nevertheless", "next", "no", "nobody", "none", "nonetheless", "noone", "nope", "nor", "not", "nothing", "notwithstanding", "now", "nowadays", "nowhere", "of", "off", "often", "ok", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "own", "per", "perhaps", "plenty", "provide", "quite", "rather", 
"really", "round", "said", "sake", "same", "sang", "save", "saw", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "seldom", "selves", "sent", "several", "shalt", "she", "should", "shown", "sideways", "since", "slept", "slew", "slung", "slunk", "smote", "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", "somewhat", "somewhere", "spake", "spat", "spoke", "spoken", "sprang", "sprung", "stave", "staves", "still", "such", "supposing", "than", "that", "the", "thee", "their", "them", "themselves", "then", "thence", "thenceforth", "there", "thereabout", "thereabouts", "thereafter", "thereby", "therefore", "therein", "thereof", "thereon", "thereto", "thereupon", "these", "they", "this", "those", "thou", "though", "thrice", "through", "throughout", "thru", "thus", "thy", "thyself", "till", "to", "together", "too", "toward", "towards", "ugh", "unable", "under", "underneath", "unless", "unlike", "until", "up", "upon", "upward", "upwards", "us", "use", "used", "using", "very", "via", "vs", "want", "was", "we", "week", "well", "were", "what", "whatever", "whatsoever", "when", "whence", "whenever", "whensoever", "where", "whereabouts", "whereafter", "whereas", "whereat", "whereby", "wherefore", "wherefrom", "wherein", "whereinto", "whereof", "whereon", "wheresoever", "whereto", "whereunto", "whereupon", "wherever", "wherewith", "whether", "whew", "which", "whichever", "whichsoever", "while", "whilst", "whither", "who", "whoa", "whoever", "whole", "whom", "whomever", "whomsoever", "whose", "whosoever", "why", "will", "wilt", "with", "within", "without", "worse", "worst", "would", "wow", "ye", "yet", "year", "yippee", "you", "your", "yours", "yourself", "yourselves" ]  +These are the default stopwords provided by Lemur. + +
+
+
+ + + + + + + + + +
+ + + + + diff --git a/doc/classes/Rir/Document.html b/doc/classes/Rir/Document.html new file mode 100644 index 0000000..9f6f91d --- /dev/null +++ b/doc/classes/Rir/Document.html @@ -0,0 +1,312 @@ + + + + Class: Rir::Document [RDoc Documentation] + + + + + + + + + +
+ + + + + + + + + + + + + + + + +
ClassRir::Document
In: + + + + + lib/rir/document.rb + + + + +
+ +
Parent: + + Object + +
+
+ + +
+ +
+ +
+

+A Document is a bag of words and is constructed +from a string. +

+ +
+ +
+ + +
+

Methods

+ +
+ + count_words   + + entropy   + + format_words   + + new   + + ngrams   + +
+
+ +
+ + + +
+ + + +
+

Attributes

+ +
+ + + + + + + + + + + + + + + + + + +
doc_content [R] 
words [R] 
+
+
+ + + + +
+ +

Public Class methods

+ + +
+ + + + +
+ +
+
+ + +

Public Instance methods

+ + +
+ + + + +
+ +

+Returns a Hash containing the words and their associated counts in the +current Document. +

+
+  count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }
+
+ +
+
+ + +
+ + + + +
+ +

+Computes the entropy of a given string s inside the document. +

+

+If the string parameter is composed of many words (i.e. tokens separated by +whitespace(s)), it is considered as an ngram. +

+
+  entropy("guitar") #=> 0.00389919463243839
+
+ +
+
+ + +
+ + + + +
+ +

+Returns an Array containing the n-grams (words) from the current +Document. +

+
+  ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]
+
+ +
+
+ + +

Protected Instance methods

+ + +
+ + + + +
+ +

+Any non-word characters are removed from the words (see perldoc.perl.org/perlre.html +and the \W special escape). +

+

+Protected function, only meant to be called at initialization. +

+ +
+
+ + + +
+ + + + +
+ + + + + diff --git a/doc/classes/Rir/Document.src/M000010.html b/doc/classes/Rir/Document.src/M000010.html new file mode 100644 index 0000000..a030e0e --- /dev/null +++ b/doc/classes/Rir/Document.src/M000010.html @@ -0,0 +1,23 @@ + + + + format_words (Rir::Document) + + + + +
# File lib/rir/document.rb, line 31
+    def format_words
+      wo = []
+
+      @doc_content.split.each do |w|
+        w.split(/\W/).each do |sw| 
+          wo.push(sw) if sw =~ /[a-zA-Z]/ 
+        end
+      end
+      
+      wo
+    end
+ + diff --git a/doc/classes/Rir/Document.src/M000011.html b/doc/classes/Rir/Document.src/M000011.html new file mode 100644 index 0000000..e12fd03 --- /dev/null +++ b/doc/classes/Rir/Document.src/M000011.html @@ -0,0 +1,26 @@ + + + + ngrams (Rir::Document) + + + + +
# File lib/rir/document.rb, line 46
+    def ngrams(n)
+      window       = []
+      ngrams_array = []
+
+      @words.each do |w|
+        window.push(w)
+        if window.size == n
+          ngrams_array.push window.join(" ")
+          window.delete_at(0)
+        end
+      end
+
+      ngrams_array.uniq
+    end
+ + diff --git a/doc/classes/Rir/Document.src/M000012.html b/doc/classes/Rir/Document.src/M000012.html new file mode 100644 index 0000000..9f23ad9 --- /dev/null +++ b/doc/classes/Rir/Document.src/M000012.html @@ -0,0 +1,18 @@ + + + + count_words (Rir::Document) + + + + +
# File lib/rir/document.rb, line 64
+    def count_words
+      counts = Hash.new { |h,k| h[k] = 0 }
+      @words.each { |w| counts[w.downcase] += 1 }
+
+      counts
+    end
+ + diff --git a/doc/classes/Rir/Document.src/M000013.html b/doc/classes/Rir/Document.src/M000013.html new file mode 100644 index 0000000..986eab3 --- /dev/null +++ b/doc/classes/Rir/Document.src/M000013.html @@ -0,0 +1,24 @@ + + + + entropy (Rir::Document) + + + + +
# File lib/rir/document.rb, line 77
+    def entropy(s)
+      en = 0.0
+      counts = self.count_words
+
+      s.split.each do |w|
+        p_wi = counts[w].to_f/@words.count.to_f
+        en += p_wi*Math.log2(p_wi)
+      end
+
+      en *= -1
+      en
+    end
+ + diff --git a/doc/classes/Rir/Document.src/M000014.html b/doc/classes/Rir/Document.src/M000014.html new file mode 100644 index 0000000..8c644b0 --- /dev/null +++ b/doc/classes/Rir/Document.src/M000014.html @@ -0,0 +1,16 @@ + + + + new (Rir::Document) + + + + +
# File lib/rir/document.rb, line 92
+    # Builds a Document from the raw text +content+ and derives its words.
+    def initialize(content)
+      @doc_content = content
+      # format_words reads @doc_content, so it has to be assigned first.
+      @words = format_words
+    end
+ + diff --git a/doc/classes/Rir/WebDocument.html b/doc/classes/Rir/WebDocument.html new file mode 100644 index 0000000..35e437f --- /dev/null +++ b/doc/classes/Rir/WebDocument.html @@ -0,0 +1,209 @@ + + + + Class: Rir::WebDocument [RDoc Documentation] + + + + + + + + + +
+ + + + + + + + + + + + + + + + +
ClassRir::WebDocument
In: + + + + + lib/rir/document.rb + + + + +
+ +
Parent: + + + + Rir::Document + + + +
+
+ + +
+ +
+ +
+

+A WebDocument is a Document with a url. +

+ +
+ +
+ + +
+

Methods

+ +
+ + get_content   + + new   + +
+
+ +
+ + + +
+ + + +
+

Attributes

+ +
+ + + + + + + + + + +
url [R] 
+
+
+ + + + +
+ +

Public Class methods

+ + +
+ + + + +
+ +

+Returns the HTML text from the page of a given url. +

+ +
+
+ + +
+ + + + +
+ +

+WebDocument constructor, the content of the +Document is the HTML page without the tags. +

+ +
+
+ + + +
+ + + + +
+ + + + + diff --git a/doc/classes/Rir/WebDocument.src/M000015.html b/doc/classes/Rir/WebDocument.src/M000015.html new file mode 100644 index 0000000..31f8332 --- /dev/null +++ b/doc/classes/Rir/WebDocument.src/M000015.html @@ -0,0 +1,16 @@ + + + + get_content (Rir::WebDocument) + + + + +
# File lib/rir/document.rb, line 105
+    def self.get_content(url)
+      require 'net/http'
+      Net::HTTP.get(URI.parse(url))
+    end
+ + diff --git a/doc/classes/Rir/WebDocument.src/M000016.html b/doc/classes/Rir/WebDocument.src/M000016.html new file mode 100644 index 0000000..1186ae7 --- /dev/null +++ b/doc/classes/Rir/WebDocument.src/M000016.html @@ -0,0 +1,16 @@ + + + + new (Rir::WebDocument) + + + + +
# File lib/rir/document.rb, line 112
+    def initialize(url)
+      @url = url
+      super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags
+    end
+ + diff --git a/doc/classes/Rir/WikipediaPage.html b/doc/classes/Rir/WikipediaPage.html new file mode 100644 index 0000000..0824f45 --- /dev/null +++ b/doc/classes/Rir/WikipediaPage.html @@ -0,0 +1,122 @@ + + + + Class: Rir::WikipediaPage [RDoc Documentation] + + + + + + + + + +
+ + + + + + + + + + + + + + + + +
ClassRir::WikipediaPage
In: + + + + + lib/rir/document.rb + + + + +
+ +
Parent: + + + + Rir::WebDocument + + + +
+
+ + +
+ +
+ +
+

+A WikipediaPage is a WebDocument. +

+ +
+ +
+ + +
+ + + +
+ + + + + + + + + +
+ + + + + diff --git a/doc/classes/String.html b/doc/classes/String.html new file mode 100644 index 0000000..3ed6a1d --- /dev/null +++ b/doc/classes/String.html @@ -0,0 +1,404 @@ + + + + Class: String [RDoc Documentation] + + + + + + + + + +
+ + + + + + + + + + + + + + + + +
ClassString
In: + + + + + lib/rir/string.rb + + + + +
+ +
Parent: + + Object + +
+
+ + +
+ +
+ +
+

+Extension of the standard class String with +useful functions. +

+ +
+ +
+ + + + +
+ + + +
+

Included Modules

+ +
+ + Rir + +
+
+ +
+ + + + + + +
+ +

Public Instance methods

+ + +
+ + + + +
+ +

+Returns the text values inside all occurrences of an XML tag in self. +

+
+  s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre"
+  s.extract_xmltags_values 'a' #=> ["Indianapolis", "Indiana"]
+
+ +
+
+ + +
+ + + + +
+ +

+Returns true if self belongs to Rir::Stoplist, +false otherwise. +

+ +
+
+ + +
+ + + + +
+ +

+Do not use. TODO: revamp; find out why this function is here. +

+ +
+
+ + +
+ + + + +
+ +

+Removes all Javascript sources from self. +

+
+  s = "<script type='text/javascript'>
+        var skin='vector',
+        stylepath='http://bits.wikimedia.org/skins-1.5'
+       </script>
+
+       test"
+  s.strip_javascripts                   #=> "test"
+
+ +
+
+ + +
+ + + + +
+ +

+Removes all Javascript sources from self. +

+
+  s = "<script type='text/javascript'>
+        var skin='vector',
+        stylepath='http://bits.wikimedia.org/skins-1.5'
+       </script>
+
+       test"
+  s.strip_javascripts!
+  s                                     #=> "test"
+
+ +
+
+ + +
+ + + + +
+ +
+
+ + +
+ + + + +
+ +
+
+ + +
+ + + + +
+ +

+Removes all XML-like tags from self. +

+
+  s = "<html><body>test</body></html>"
+  s.strip_xml_tags                      #=> "test"
+  s                                     #=> "<html><body>test</body></html>"
+
+ +
+
+ + +
+ + + + +
+ +

+Removes all XML-like tags from self. +

+
+  s = "<html><body>test</body></html>"
+  s.strip_xml_tags!
+  s                                     #=> "test"
+
+ +
+
+ + + +
+ + + + +
+ + + + + diff --git a/doc/classes/String.src/M000001.html b/doc/classes/String.src/M000001.html new file mode 100644 index 0000000..f96e8b1 --- /dev/null +++ b/doc/classes/String.src/M000001.html @@ -0,0 +1,15 @@ + + + + is_stopword? (String) + + + + +
# File lib/rir/string.rb, line 77
+  def is_stopword?
+    Stoplist.include?(self.downcase)
+  end
+ + diff --git a/doc/classes/String.src/M000002.html b/doc/classes/String.src/M000002.html new file mode 100644 index 0000000..1d3aa25 --- /dev/null +++ b/doc/classes/String.src/M000002.html @@ -0,0 +1,15 @@ + + + + remove_special_characters (String) + + + + +
# File lib/rir/string.rb, line 83
+  def remove_special_characters
+    self.split.collect { |w| w.gsub(/\W/,' ').split.collect { |w| w.gsub(/\W/,' ').strip.sub(/\A.\z/, '')}.join(' ').strip.sub(/\A.\z/, '')}.join(' ')
+  end
+ + diff --git a/doc/classes/String.src/M000003.html b/doc/classes/String.src/M000003.html new file mode 100644 index 0000000..21c6728 --- /dev/null +++ b/doc/classes/String.src/M000003.html @@ -0,0 +1,15 @@ + + + + strip_xml_tags! (String) + + + + +
# File lib/rir/string.rb, line 92
+  def strip_xml_tags!
+    replace strip_with_pattern /<\/?[^>]*>/
+  end
+ + diff --git a/doc/classes/String.src/M000004.html b/doc/classes/String.src/M000004.html new file mode 100644 index 0000000..a913161 --- /dev/null +++ b/doc/classes/String.src/M000004.html @@ -0,0 +1,15 @@ + + + + strip_xml_tags (String) + + + + +
# File lib/rir/string.rb, line 101
+  def strip_xml_tags
+    dup.strip_xml_tags!
+  end
+ + diff --git a/doc/classes/String.src/M000005.html b/doc/classes/String.src/M000005.html new file mode 100644 index 0000000..188323f --- /dev/null +++ b/doc/classes/String.src/M000005.html @@ -0,0 +1,15 @@ + + + + strip_javascripts! (String) + + + + +
# File lib/rir/string.rb, line 115
+  def strip_javascripts!
+    replace strip_with_pattern /<script type="text\/javascript">(.+?)<\/script>/m 
+  end
+ + diff --git a/doc/classes/String.src/M000006.html b/doc/classes/String.src/M000006.html new file mode 100644 index 0000000..ad91df4 --- /dev/null +++ b/doc/classes/String.src/M000006.html @@ -0,0 +1,15 @@ + + + + strip_javascripts (String) + + + + +
# File lib/rir/string.rb, line 128
+  def strip_javascripts
+    dup.strip_javascripts!
+  end
+ + diff --git a/doc/classes/String.src/M000007.html b/doc/classes/String.src/M000007.html new file mode 100644 index 0000000..448264e --- /dev/null +++ b/doc/classes/String.src/M000007.html @@ -0,0 +1,16 @@ + + + + strip_stylesheets! (String) + + + + +
# File lib/rir/string.rb, line 132
+  def strip_stylesheets!
+  # TODO: rewamp. dunno what is it.
+    replace strip_with_pattern /<style type="text\/css">(.+?)<\/style>/m 
+  end
+ + diff --git a/doc/classes/String.src/M000008.html b/doc/classes/String.src/M000008.html new file mode 100644 index 0000000..8a44d27 --- /dev/null +++ b/doc/classes/String.src/M000008.html @@ -0,0 +1,15 @@ + + + + strip_stylesheets (String) + + + + +
# File lib/rir/string.rb, line 137
+  def strip_stylesheets
+    dup.strip_stylesheets!
+  end
+ + diff --git a/doc/classes/String.src/M000009.html b/doc/classes/String.src/M000009.html new file mode 100644 index 0000000..2203bd0 --- /dev/null +++ b/doc/classes/String.src/M000009.html @@ -0,0 +1,15 @@ + + + + extract_xmltags_values (String) + + + + +
# File lib/rir/string.rb, line 145
+  def extract_xmltags_values(tag_name)
+    self.scan(/<#{tag_name}.*?>(.+?)<\/#{tag_name}>/).flatten
+  end
+ + diff --git a/doc/created.rid b/doc/created.rid new file mode 100644 index 0000000..3035b6c --- /dev/null +++ b/doc/created.rid @@ -0,0 +1 @@ +Fri, 05 Nov 2010 14:41:10 +0100 diff --git a/doc/files/README_markdown.html b/doc/files/README_markdown.html new file mode 100644 index 0000000..013aed3 --- /dev/null +++ b/doc/files/README_markdown.html @@ -0,0 +1,90 @@ + + + + File: README.markdown [RDoc Documentation] + + + + + + + + + +
+

README.markdown

+ + + + + + + + + +
Path:README.markdown + +
Last Update:2010-11-05 14:40:41 +0100
+
+ + +
+ +
+ +
+ + +
+ + + +
+ + + + + + + + + +
+ + + + + diff --git a/doc/files/lib/rir/document_rb.html b/doc/files/lib/rir/document_rb.html new file mode 100644 index 0000000..caddfbf --- /dev/null +++ b/doc/files/lib/rir/document_rb.html @@ -0,0 +1,127 @@ + + + + File: document.rb [RDoc Documentation] + + + + + + + + + +
+

document.rb

+ + + + + + + + + +
Path:lib/rir/document.rb + +
Last Update:2010-11-05 14:39:35 +0100
+
+ + +
+ +
+ +
+

+This file is a part of an Information Retrieval oriented Ruby library +

+

+Copyright (C) 2010-2011 Romain Deveaud +

+

+This program is free software: you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the Free +Software Foundation, either version 3 of the License, or (at your option) +any later version. +

+

+This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +more details. +

+

+You should have received a copy of the GNU General Public License along +with this program. If not, see <www.gnu.org/licenses/>. +

+ +
+ +
+

Required files

+ +
+ + net/http   + +
+
+ +
+ + +
+ + + +
+ + + + + + + + + +
+ + + + + diff --git a/doc/files/lib/rir/string_rb.html b/doc/files/lib/rir/string_rb.html new file mode 100644 index 0000000..961b94b --- /dev/null +++ b/doc/files/lib/rir/string_rb.html @@ -0,0 +1,129 @@ + + + + File: string.rb [RDoc Documentation] + + + + + + + + + +
+

string.rb

+ + + + + + + + + +
Path:lib/rir/string.rb + +
Last Update:2010-11-05 14:39:35 +0100
+
+ + +
+ +
+ +
+

+This file is a part of an Information Retrieval oriented Ruby library +

+

+Copyright (C) 2010-2011 Romain Deveaud +

+

+This program is free software: you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the Free +Software Foundation, either version 3 of the License, or (at your option) +any later version. +

+

+This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +more details. +

+

+You should have received a copy of the GNU General Public License along +with this program. If not, see <www.gnu.org/licenses/>. +

+ +
+ +
+

Required files

+ +
+ + cgi   + + kconv   + +
+
+ +
+ + +
+ + + +
+ + + + + + + + + +
+ + + + + diff --git a/doc/files/lib/rir_rb.html b/doc/files/lib/rir_rb.html new file mode 100644 index 0000000..3a8552d --- /dev/null +++ b/doc/files/lib/rir_rb.html @@ -0,0 +1,102 @@ + + + + File: rir.rb [RDoc Documentation] + + + + + + + + + +
+

rir.rb

+ + + + + + + + + +
Path:lib/rir.rb + +
Last Update:2010-11-05 14:39:35 +0100
+
+ + +
+ +
+ +
+

Required files

+ +
+ + rir/document   + + rir/string   + +
+
+ +
+ + +
+ + + +
+ + + + + + + + + +
+ + + + + diff --git a/doc/files/main_rb.html b/doc/files/main_rb.html new file mode 100644 index 0000000..3f0de48 --- /dev/null +++ b/doc/files/main_rb.html @@ -0,0 +1,100 @@ + + + + File: main.rb [RDoc Documentation] + + + + + + + + + +
+

main.rb

+ + + + + + + + + +
Path:main.rb + +
Last Update:2010-11-05 14:40:11 +0100
+
+ + +
+ +
+ +
+

Required files

+ +
+ + rir   + +
+
+ +
+ + +
+ + + +
+ + + + + + + + + +
+ + + + + diff --git a/doc/fr_class_index.html b/doc/fr_class_index.html new file mode 100644 index 0000000..095130d --- /dev/null +++ b/doc/fr_class_index.html @@ -0,0 +1,33 @@ + + + + + Classes [RDoc Documentation] + + + + + +
+

Classes

+ +
+ + diff --git a/doc/fr_file_index.html b/doc/fr_file_index.html new file mode 100644 index 0000000..6a45fa1 --- /dev/null +++ b/doc/fr_file_index.html @@ -0,0 +1,33 @@ + + + + + Files [RDoc Documentation] + + + + + + + + diff --git a/doc/fr_method_index.html b/doc/fr_method_index.html new file mode 100644 index 0000000..3b25a4f --- /dev/null +++ b/doc/fr_method_index.html @@ -0,0 +1,55 @@ + + + + + Methods [RDoc Documentation] + + + + + + + + diff --git a/doc/index.html b/doc/index.html new file mode 100644 index 0000000..ba843c5 --- /dev/null +++ b/doc/index.html @@ -0,0 +1,21 @@ + + + + + RDoc Documentation + + + + + + + + + + + diff --git a/doc/rdoc-style.css b/doc/rdoc-style.css new file mode 100644 index 0000000..cf8367d --- /dev/null +++ b/doc/rdoc-style.css @@ -0,0 +1,299 @@ +body { + font-family: Verdana,Arial,Helvetica,sans-serif; + font-size: 90%; + margin: 0; + margin-left: 40px; + padding: 0; + background: white; + color: black; +} + +h1, h2, h3, h4 { + margin: 0; + background: transparent; +} + +h1 { + font-size: 150%; +} + +h2,h3,h4 { + margin-top: 1em; +} + +:link, :visited { + background: #eef; + color: #039; + text-decoration: none; +} + +:link:hover, :visited:hover { + background: #039; + color: #eef; +} + +/* Override the base stylesheet's Anchor inside a table cell */ +td > :link, td > :visited { + background: transparent; + color: #039; + text-decoration: none; +} + +/* and inside a section title */ +.section-title > :link, .section-title > :visited { + background: transparent; + color: #eee; + text-decoration: none; +} + +/* === Structural elements =================================== */ + +.index { + margin: 0; + margin-left: -40px; + padding: 0; + font-size: 90%; +} + +.index :link, .index :visited { + margin-left: 0.7em; +} + +.index .section-bar { + margin-left: 0px; + padding-left: 0.7em; + background: #ccc; + font-size: small; +} + +#classHeader, #fileHeader { + width: auto; + color: white; + padding: 0.5em 1.5em 0.5em 1.5em; + margin: 
0; + margin-left: -40px; + border-bottom: 3px solid #006; +} + +#classHeader :link, #fileHeader :link, +#classHeader :visited, #fileHeader :visited { + background: inherit; + color: white; +} + +#classHeader td, #fileHeader td { + background: inherit; + color: white; +} + +#fileHeader { + background: #057; +} + +#classHeader { + background: #048; +} + +.class-name-in-header { + font-size: 180%; + font-weight: bold; +} + +#bodyContent { + padding: 0 1.5em 0 1.5em; +} + +#description { + padding: 0.5em 1.5em; + background: #efefef; + border: 1px dotted #999; +} + +#description h1, #description h2, #description h3, +#description h4, #description h5, #description h6 { + color: #125; + background: transparent; +} + +#validator-badges { + text-align: center; +} + +#validator-badges img { + border: 0; +} + +#copyright { + color: #333; + background: #efefef; + font: 0.75em sans-serif; + margin-top: 5em; + margin-bottom: 0; + padding: 0.5em 2em; +} + +/* === Classes =================================== */ + +table.header-table { + color: white; + font-size: small; +} + +.type-note { + font-size: small; + color: #dedede; +} + +.section-bar { + color: #333; + border-bottom: 1px solid #999; + margin-left: -20px; +} + +.section-title { + background: #79a; + color: #eee; + padding: 3px; + margin-top: 2em; + margin-left: -30px; + border: 1px solid #999; +} + +.top-aligned-row { + vertical-align: top +} + +.bottom-aligned-row { + vertical-align: bottom +} + +#diagram img { + border: 0; +} + +/* --- Context section classes ----------------------- */ + +.context-row { } + +.context-item-name { + font-family: monospace; + font-weight: bold; + color: black; +} + +.context-item-value { + font-size: small; + color: #448; +} + +.context-item-desc { + color: #333; + padding-left: 2em; +} + +/* --- Method classes -------------------------- */ + +.method-detail { + background: #efefef; + padding: 0; + margin-top: 0.5em; + margin-bottom: 1em; + border: 1px dotted #ccc; +} + +.method-heading { 
+ color: black; + background: #ccc; + border-bottom: 1px solid #666; + padding: 0.2em 0.5em 0 0.5em; +} + +.method-signature { + color: black; + background: inherit; +} + +.method-name { + font-weight: bold; +} + +.method-args { + font-style: italic; +} + +.method-description { + padding: 0 0.5em 0 0.5em; +} + +/* --- Source code sections -------------------- */ + +:link.source-toggle, :visited.source-toggle { + font-size: 90%; +} + +div.method-source-code { + background: #262626; + color: #ffdead; + margin: 1em; + padding: 0.5em; + border: 1px dashed #999; + overflow: auto; +} + +div.method-source-code pre { + color: #ffdead; +} + +/* --- Ruby keyword styles --------------------- */ + +.standalone-code { + background: #221111; + color: #ffdead; + overflow: auto; +} + +.ruby-constant { + color: #7fffd4; + background: transparent; +} + +.ruby-keyword { + color: #00ffff; + background: transparent; +} + +.ruby-ivar { + color: #eedd82; + background: transparent; +} + +.ruby-operator { + color: #00ffee; + background: transparent; +} + +.ruby-identifier { + color: #ffdead; + background: transparent; +} + +.ruby-node { + color: #ffa07a; + background: transparent; +} + +.ruby-comment { + color: #b22222; + font-weight: bold; + background: transparent; +} + +.ruby-regexp { + color: #ffa07a; + background: transparent; +} + +.ruby-value { + color: #7fffd4; + background: transparent; +} diff --git a/lib/rir.rb b/lib/rir.rb new file mode 100644 index 0000000..58b58ff --- /dev/null +++ b/lib/rir.rb @@ -0,0 +1,4 @@ +#!/usr/bin/env ruby + +require 'rir/document' +require 'rir/string' diff --git a/lib/rir/document.rb b/lib/rir/document.rb new file mode 100644 index 0000000..dc80db4 --- /dev/null +++ b/lib/rir/document.rb @@ -0,0 +1,121 @@ +#!/usr/bin/env ruby + +# This file is a part of an Information Retrieval oriented Ruby library +# +# Copyright (C) 2010-2011 Romain Deveaud +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU 
General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# General module for many purposes related to Information Retrieval. +module Rir + + # A Document is a bag of words and is constructed from a string. + class Document + attr_reader :words, :doc_content + + # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html + # and the \\W special escape). + # + # Protected function, only meant to by called at the initialization. + def format_words + wo = [] + + @doc_content.split.each do |w| + w.split(/\W/).each do |sw| + wo.push(sw) if sw =~ /[a-zA-Z]/ + end + end + + wo + end + + # Returns an Array containing the +n+-grams (words) from the current Document. + # + # ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...] + def ngrams(n) + window = [] + ngrams_array = [] + + @words.each do |w| + window.push(w) + if window.size == n + ngrams_array.push window.join(" ") + window.delete_at(0) + end + end + + ngrams_array.uniq + end + + # Returns a Hash containing the words and their associated counts in the current Document. + # + # count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... } + def count_words + counts = Hash.new { |h,k| h[k] = 0 } + @words.each { |w| counts[w.downcase] += 1 } + + counts + end + + # Computes the entropy of a given string +s+ inside the document. + # + # If the string parameter is composed of many words (i.e. tokens separated + # by whitespace(s)), it is considered as an ngram. 
+ # + # entropy("guitar") #=> 0.00389919463243839 + def entropy(s) + en = 0.0 + counts = self.count_words + + s.split.each do |w| + p_wi = counts[w].to_f/@words.count.to_f + en += p_wi*Math.log2(p_wi) + end + + en *= -1 + en + end + + + + def initialize(content) + @doc_content = content + @words = format_words + end + + protected :format_words + end + + # A WebDocument is a Document with a +url+. + class WebDocument < Document + attr_reader :url + + # Returns the HTML text from the page of a given +url+. + def self.get_content(url) + require 'net/http' + Net::HTTP.get(URI.parse(url)) + end + + # WebDocument constructor, the content of the Document is the HTML page + # without the tags. + def initialize(url) + @url = url + super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags + end + end + + # A WikipediaPage is a WebDocument. + class WikipediaPage < WebDocument + end +end diff --git a/lib/rir/string.rb b/lib/rir/string.rb new file mode 100644 index 0000000..250cc14 --- /dev/null +++ b/lib/rir/string.rb @@ -0,0 +1,155 @@ +#!/usr/bin/env ruby + +# This file is a part of an Information Retrieval oriented Ruby library +# +# Copyright (C) 2010-2011 Romain Deveaud +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +# General module for many purposes related to Information Retrieval. +module Rir + + # These are the default stopwords provided by Lemur. 
+ Stoplist = [ + "a", "anything", "anyway", "anywhere", "apart", "are", "around", "as", "at", "av", + "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", + "behind", "being", "below", "beside", "besides", "between", "beyond", "both", "but", "by", + "can", "cannot", "canst", "certain", "cf", "choose", "contrariwise", "cos", "could", "cu", + "day", "do", "does", "doesn't", "doing", "dost", "doth", "double", "down", "dual", "during", + "each", "either", "else", "elsewhere", "enough", "et", "etc", "even", "ever", "every", + "everybody", "everyone", "everything", "everywhere", "except", "excepted", "excepting", + "exception", "exclude", "excluding", "exclusive", "far", "farther", "farthest", "few", "ff", + "first", "for", "formerly", "forth", "forward", "from", "front", "further", "furthermore", + "furthest", "get", "go", "had", "halves", "hardly", "has", "hast", "hath", "have", "he", + "hence", "henceforth", "her", "here", "hereabouts", "hereafter", "hereby", "herein", "hereto", + "hereupon", "hers", "herself", "him", "himself", "hindmost", "his", "hither", "hitherto", + "how", "however", "howsoever", "i", "ie", "if", "in", "inasmuch", "inc", "include", + "included", "including", "indeed", "indoors", "inside", "insomuch", "instead", "into", + "inward", "inwards", "is", "it", "its", "itself", "just", "kind", "kg", "km", "last", + "latter", "latterly", "less", "lest", "let", "like", "little", "ltd", "many", "may", "maybe", + "me", "meantime", "meanwhile", "might", "moreover", "most", "mostly", "more", "mr", "mrs", + "ms", "much", "must", "my", "myself", "namely", "need", "neither", "never", "nevertheless", + "next", "no", "nobody", "none", "nonetheless", "noone", "nope", "nor", "not", "nothing", + "notwithstanding", "now", "nowadays", "nowhere", "of", "off", "often", "ok", "on", "once", + "one", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", + "ourselves", "out", "outside", "over", "own", "per", 
"perhaps", "plenty", "provide", "quite", + "rather", "really", "round", "said", "sake", "same", "sang", "save", "saw", "see", "seeing", + "seem", "seemed", "seeming", "seems", "seen", "seldom", "selves", "sent", "several", "shalt", + "she", "should", "shown", "sideways", "since", "slept", "slew", "slung", "slunk", "smote", + "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", + "somewhat", "somewhere", "spake", "spat", "spoke", "spoken", "sprang", "sprung", "stave", + "staves", "still", "such", "supposing", "than", "that", "the", "thee", "their", "them", + "themselves", "then", "thence", "thenceforth", "there", "thereabout", "thereabouts", + "thereafter", "thereby", "therefore", "therein", "thereof", "thereon", "thereto", "thereupon", + "these", "they", "this", "those", "thou", "though", "thrice", "through", "throughout", "thru", + "thus", "thy", "thyself", "till", "to", "together", "too", "toward", "towards", "ugh", + "unable", "under", "underneath", "unless", "unlike", "until", "up", "upon", "upward", + "upwards", "us", "use", "used", "using", "very", "via", "vs", "want", "was", "we", "week", + "well", "were", "what", "whatever", "whatsoever", "when", "whence", "whenever", "whensoever", + "where", "whereabouts", "whereafter", "whereas", "whereat", "whereby", "wherefore", + "wherefrom", "wherein", "whereinto", "whereof", "whereon", "wheresoever", "whereto", + "whereunto", "whereupon", "wherever", "wherewith", "whether", "whew", "which", "whichever", + "whichsoever", "while", "whilst", "whither", "who", "whoa", "whoever", "whole", "whom", + "whomever", "whomsoever", "whose", "whosoever", "why", "will", "wilt", "with", "within", + "without", "worse", "worst", "would", "wow", "ye", "yet", "year", "yippee", "you", "your", + "yours", "yourself", "yourselves" + ] + + +end + +# Extention of the standard class String with useful function. 
+class String + include Rir + + # Returns +true+ if +self+ belongs to Rir::Stoplist, +false+ otherwise. + def is_stopword? + Stoplist.include?(self.downcase) + end + + # Do not use. + # TODO: rewamp. find why this function is here. + def remove_special_characters + self.split.collect { |w| w.gsub(/\W/,' ').split.collect { |w| w.gsub(/\W/,' ').strip.sub(/\A.\z/, '')}.join(' ').strip.sub(/\A.\z/, '')}.join(' ') + end + + # Removes all XML-like tags from +self+. + # + # s = "test" + # s.strip_xml_tags! + # s #=> "test" + def strip_xml_tags! + replace strip_with_pattern /<\/?[^>]*>/ + end + + # Removes all XML-like tags from +self+. + # + # s = "test" + # s.strip_xml_tags #=> "test" + # s #=> "test" + def strip_xml_tags + dup.strip_xml_tags! + end + + # Removes all Javascript sources from +self+. + # + # s = " + # + # test" + # s.strip_javascripts! + # s #=> "test" + def strip_javascripts! + replace strip_with_pattern / + # + # test" + # s.strip_javascripts #=> "test" + def strip_javascripts + dup.strip_javascripts! + end + + def strip_stylesheets! + # TODO: rewamp. dunno what is it. + replace strip_with_pattern /