diff --git a/doc/classes/Mirimiri.html b/doc/classes/Mirimiri.html new file mode 100644 index 0000000..6e74cd8 --- /dev/null +++ b/doc/classes/Mirimiri.html @@ -0,0 +1,150 @@ + + +
+Module | +Mirimiri | +
In: | +
+
+
+
+
+ lib/mirimiri/string.rb
+
+
+
+
+ + + + + + lib/mirimiri/document.rb + + + + + + + |
+
+General module +
+ +Stoplist | += | +[ "a", "anything", "anyway", "anywhere", "apart", "are", "around", "as", "at", "av", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "both", "but", "by", "can", "cannot", "canst", "certain", "cf", "choose", "contrariwise", "cos", "could", "cu", "day", "do", "does", "doesn't", "doing", "dost", "doth", "double", "down", "dual", "during", "each", "either", "else", "elsewhere", "enough", "et", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "except", "excepted", "excepting", "exception", "exclude", "excluding", "exclusive", "far", "farther", "farthest", "few", "ff", "first", "for", "formerly", "forth", "forward", "from", "front", "further", "furthermore", "furthest", "get", "go", "had", "halves", "hardly", "has", "hast", "hath", "have", "he", "hence", "henceforth", "her", "here", "hereabouts", "hereafter", "hereby", "herein", "hereto", "hereupon", "hers", "herself", "him", "himself", "hindmost", "his", "hither", "hitherto", "how", "however", "howsoever", "i", "ie", "if", "in", "inasmuch", "inc", "include", "included", "including", "indeed", "indoors", "inside", "insomuch", "instead", "into", "inward", "inwards", "is", "it", "its", "itself", "just", "kind", "kg", "km", "last", "latter", "latterly", "less", "lest", "let", "like", "little", "ltd", "many", "may", "maybe", "me", "meantime", "meanwhile", "might", "moreover", "most", "mostly", "more", "mr", "mrs", "ms", "much", "must", "my", "myself", "namely", "need", "neither", "never", "nevertheless", "next", "no", "nobody", "none", "nonetheless", "noone", "nope", "nor", "not", "nothing", "notwithstanding", "now", "nowadays", "nowhere", "of", "off", "often", "ok", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "own", "per", "perhaps", "plenty", "provide", "quite", 
"rather", "really", "round", "said", "sake", "same", "sang", "save", "saw", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "seldom", "selves", "sent", "several", "shalt", "she", "should", "shown", "sideways", "since", "slept", "slew", "slung", "slunk", "smote", "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", "somewhat", "somewhere", "spake", "spat", "spoke", "spoken", "sprang", "sprung", "stave", "staves", "still", "such", "supposing", "than", "that", "the", "thee", "their", "them", "themselves", "then", "thence", "thenceforth", "there", "thereabout", "thereabouts", "thereafter", "thereby", "therefore", "therein", "thereof", "thereon", "thereto", "thereupon", "these", "they", "this", "those", "thou", "though", "thrice", "through", "throughout", "thru", "thus", "thy", "thyself", "till", "to", "together", "too", "toward", "towards", "ugh", "unable", "under", "underneath", "unless", "unlike", "until", "up", "upon", "upward", "upwards", "us", "use", "used", "using", "very", "via", "vs", "want", "was", "we", "week", "well", "were", "what", "whatever", "whatsoever", "when", "whence", "whenever", "whensoever", "where", "whereabouts", "whereafter", "whereas", "whereat", "whereby", "wherefore", "wherefrom", "wherein", "whereinto", "whereof", "whereon", "wheresoever", "whereto", "whereunto", "whereupon", "wherever", "wherewith", "whether", "whew", "which", "whichever", "whichsoever", "while", "whilst", "whither", "who", "whoa", "whoever", "whole", "whom", "whomever", "whomsoever", "whose", "whosoever", "why", "will", "wilt", "with", "within", "without", "worse", "worst", "would", "wow", "ye", "yet", "year", "yippee", "you", "your", "yours", "yourself", "yourselves" ] | + ++ | +These are the default stopwords provided by Lemur. + + | + +
Class | +Mirimiri::Document | +
In: | +
+
+
+
+
+ lib/mirimiri/document.rb
+
+
+
+
+ + + |
+
Parent: | ++ + Object + + | +
+A Document is a bag of words and is constructed +from a string. +
+ +doc_content | + +[R] | + ++ |
words | + +[R] | + ++ |
+Returns a Hash containing the words and their associated counts in the +current Document. +
++ count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... } ++ +
+Computes the entropy of a given string s inside the document. +
++If the string parameter is composed of several words (i.e. tokens separated by +whitespace), it is treated as an n-gram. +
++ entropy("guitar") #=> 0.00432114812727959 + entropy("dillinger escape plan") #=> 0.265862076325102 ++ +
+Returns an Array containing the n-grams (words) from the current +Document. +
++ ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...] ++ +
+Computes the term frequency of a given word s. +
++ tf("guitar") #=> 0.000380372765310004 ++ +
+Any non-word characters are removed from the words (see perldoc.perl.org/perlre.html +and the \W special escape). +
++Protected function, only meant to be called at initialization. +
+ +# File lib/mirimiri/document.rb, line 34 + def format_words + wo = [] + + @doc_content.split.each do |w| + w.split(/\W/).each do |sw| + wo.push(sw.downcase) if sw =~ /[a-zA-Z]/ + end + end + + wo + end+ + diff --git a/doc/classes/Mirimiri/Document.src/M000023.html b/doc/classes/Mirimiri/Document.src/M000023.html new file mode 100644 index 0000000..136a93e --- /dev/null +++ b/doc/classes/Mirimiri/Document.src/M000023.html @@ -0,0 +1,26 @@ + + + +
# File lib/mirimiri/document.rb, line 49 + def ngrams(n) + window = [] + ngrams_array = [] + + @words.each do |w| + window.push(w) + if window.size == n + ngrams_array.push window.join(" ") + window.delete_at(0) + end + end + + ngrams_array.uniq + end+ + diff --git a/doc/classes/Mirimiri/Document.src/M000024.html b/doc/classes/Mirimiri/Document.src/M000024.html new file mode 100644 index 0000000..df19d9f --- /dev/null +++ b/doc/classes/Mirimiri/Document.src/M000024.html @@ -0,0 +1,18 @@ + + + +
# File lib/mirimiri/document.rb, line 67 + def count_words + counts = Hash.new { |h,k| h[k] = 0 } + @words.each { |w| counts[w] += 1 } + + counts + end+ + diff --git a/doc/classes/Mirimiri/Document.src/M000025.html b/doc/classes/Mirimiri/Document.src/M000025.html new file mode 100644 index 0000000..b745028 --- /dev/null +++ b/doc/classes/Mirimiri/Document.src/M000025.html @@ -0,0 +1,24 @@ + + + +
# File lib/mirimiri/document.rb, line 81 + def entropy(s) + en = 0.0 + counts = self.count_words + + s.split.each do |w| + p_wi = counts[w].to_f/@words.count.to_f + en += p_wi*Math.log2(p_wi) + end + + en *= -1 + en + end+ + diff --git a/doc/classes/Mirimiri/Document.src/M000026.html b/doc/classes/Mirimiri/Document.src/M000026.html new file mode 100644 index 0000000..ab8eab7 --- /dev/null +++ b/doc/classes/Mirimiri/Document.src/M000026.html @@ -0,0 +1,15 @@ + + + +
# File lib/mirimiri/document.rb, line 97 + def tf(s) + self.count_words[s].to_f/@words.size.to_f + end+ + diff --git a/doc/classes/Mirimiri/Document.src/M000027.html b/doc/classes/Mirimiri/Document.src/M000027.html new file mode 100644 index 0000000..a7a5576 --- /dev/null +++ b/doc/classes/Mirimiri/Document.src/M000027.html @@ -0,0 +1,16 @@ + + + +
# File lib/mirimiri/document.rb, line 102 + def initialize(content) + @doc_content = content + @words = format_words + end+ + diff --git a/doc/classes/Mirimiri/WebDocument.html b/doc/classes/Mirimiri/WebDocument.html new file mode 100644 index 0000000..810a4c9 --- /dev/null +++ b/doc/classes/Mirimiri/WebDocument.html @@ -0,0 +1,209 @@ + + + +
Class | +Mirimiri::WebDocument | +
In: | +
+
+
+
+
+ lib/mirimiri/document.rb
+
+
+
+
+ + + |
+
Parent: | ++ + + + Mirimiri::Document + + + + | +
+A WebDocument is a Document with a url. +
+ +url | + +[R] | + ++ |
+Returns the HTML text from the page of a given url. +
+ ++WebDocument constructor, the content of the +Document is the HTML page without the tags. +
+ +# File lib/mirimiri/document.rb, line 115 + def self.get_content(url) + require 'net/http' + Net::HTTP.get(URI.parse(url)) + end+ + diff --git a/doc/classes/Mirimiri/WebDocument.src/M000029.html b/doc/classes/Mirimiri/WebDocument.src/M000029.html new file mode 100644 index 0000000..c98eb93 --- /dev/null +++ b/doc/classes/Mirimiri/WebDocument.src/M000029.html @@ -0,0 +1,16 @@ + + + +
# File lib/mirimiri/document.rb, line 122 + def initialize(url) + @url = url + super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags + end+ + diff --git a/doc/classes/Mirimiri/WikipediaPage.html b/doc/classes/Mirimiri/WikipediaPage.html new file mode 100644 index 0000000..83f945a --- /dev/null +++ b/doc/classes/Mirimiri/WikipediaPage.html @@ -0,0 +1,204 @@ + + + +
Class | +Mirimiri::WikipediaPage | +
In: | +
+
+
+
+
+ lib/mirimiri/document.rb
+
+
+
+
+ + + |
+
Parent: | ++ + + + Mirimiri::WebDocument + + + + | +
+A WikipediaPage is a WebDocument. +
+ +# File lib/mirimiri/document.rb, line 135 + def self.search_wikipedia_titles(name) + raise ArgumentError, "Bad encoding", name unless name.isutf8 + + res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).toutf8).elements['api/query/search'] + + res.collect { |e| e.attributes['title'] } unless res.nil? + end+ + diff --git a/doc/classes/Mirimiri/WikipediaPage.src/M000031.html b/doc/classes/Mirimiri/WikipediaPage.src/M000031.html new file mode 100644 index 0000000..b5078fb --- /dev/null +++ b/doc/classes/Mirimiri/WikipediaPage.src/M000031.html @@ -0,0 +1,19 @@ + + + +
# File lib/mirimiri/document.rb, line 143 + def self.get_url(name) + raise ArgumentError, "Bad encoding", name unless name.isutf8 + + atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).toutf8).elements['api/query/pages/page'].attributes + + atts['fullurl'] if atts['missing'].nil? + end+ + diff --git a/doc/classes/Mirimiri/WikipediaPage.src/M000032.html b/doc/classes/Mirimiri/WikipediaPage.src/M000032.html new file mode 100644 index 0000000..f369de0 --- /dev/null +++ b/doc/classes/Mirimiri/WikipediaPage.src/M000032.html @@ -0,0 +1,17 @@ + + + +
# File lib/mirimiri/document.rb, line 151 + def self.search_homepage(name) + title = WikipediaPage.search_wikipedia_titles name + + WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty? + end+ + diff --git a/doc/files/lib/mirimiri/corpus_rb.html b/doc/files/lib/mirimiri/corpus_rb.html new file mode 100644 index 0000000..6558302 --- /dev/null +++ b/doc/files/lib/mirimiri/corpus_rb.html @@ -0,0 +1,90 @@ + + + +
Path: | +lib/mirimiri/corpus.rb + + | +
Last Update: | +2010-12-20 10:35:26 +0100 | +
Path: | +lib/mirimiri/document.rb + + | +
Last Update: | +2010-12-20 10:36:07 +0100 | +
Path: | +lib/mirimiri/query.rb + + | +
Last Update: | +2010-12-20 10:36:27 +0100 | +
Path: | +lib/mirimiri/regexp.rb + + | +
Last Update: | +2010-12-20 10:36:42 +0100 | +
Path: | +lib/mirimiri/string.rb + + | +
Last Update: | +2010-12-20 10:37:16 +0100 | +
+General module +
+ +Path: | +lib/mirimiri/ttagger.rb + + | +
Last Update: | +2010-12-20 10:37:32 +0100 | +
Path: | +lib/mirimiri.rb + + | +
Last Update: | +2010-12-20 10:33:51 +0100 | +