From fd4cb285a4975c9a0b6624e93a42eb9fa812fee4 Mon Sep 17 00:00:00 2001 From: Romain Deveaud Date: Sat, 13 Nov 2010 02:52:36 +0100 Subject: [PATCH] doc changes + document and string improvements --- doc/classes/String.html | 2 +- doc/created.rid | 2 +- doc/files/README_markdown.html | 33 ++++++++++++++++++++++++++++++++- doc/files/lib/rir/document_rb.html | 2 +- doc/files/lib/rir/string_rb.html | 2 +- doc/files/main_rb.html | 2 +- doc/fr_class_index.html | 8 ++++---- doc/fr_file_index.html | 4 ---- doc/fr_method_index.html | 14 +++++++------- doc/index.html | 2 +- lib/rir.rb | 1 + lib/rir/document.rb | 2 +- lib/rir/string.rb | 7 ++++--- main.rb | 8 ++++++++ 14 files changed, 63 insertions(+), 26 deletions(-) diff --git a/doc/classes/String.html b/doc/classes/String.html index 3ed6a1d..b0d3449 100644 --- a/doc/classes/String.html +++ b/doc/classes/String.html @@ -129,7 +129,7 @@ useful function.
- Rir + RIR
diff --git a/doc/created.rid b/doc/created.rid index 3035b6c..0b10800 100644 --- a/doc/created.rid +++ b/doc/created.rid @@ -1 +1 @@ -Fri, 05 Nov 2010 14:41:10 +0100 +Fri, 05 Nov 2010 15:06:41 +0100 diff --git a/doc/files/README_markdown.html b/doc/files/README_markdown.html index 013aed3..a52aaa0 100644 --- a/doc/files/README_markdown.html +++ b/doc/files/README_markdown.html @@ -53,7 +53,7 @@ Last Update: - 2010-11-05 14:40:41 +0100 + 2010-11-05 14:46:27 +0100 @@ -63,6 +63,37 @@
+
+

+# Ruby Information Retrieval (rIR) +

+

+Copyright (C) 2010-2011 Romain Deveaud +

+

+License +

+
=
+

+This program is free software: you can redistribute it and/or modify it +under the terms of the GNU General Public License as published by the Free +Software Foundation, either version 3 of the License, or (at your option) +any later version. +

+

+This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for +more details. +

+

+You should have received a copy of the GNU General Public License along +with this program. If not, see <www.gnu.org/licenses/>. +

+ +
+
diff --git a/doc/files/lib/rir/document_rb.html b/doc/files/lib/rir/document_rb.html index caddfbf..5dc4860 100644 --- a/doc/files/lib/rir/document_rb.html +++ b/doc/files/lib/rir/document_rb.html @@ -53,7 +53,7 @@ Last Update: - 2010-11-05 14:39:35 +0100 + 2010-11-05 15:06:24 +0100 diff --git a/doc/files/lib/rir/string_rb.html b/doc/files/lib/rir/string_rb.html index 961b94b..5b47834 100644 --- a/doc/files/lib/rir/string_rb.html +++ b/doc/files/lib/rir/string_rb.html @@ -53,7 +53,7 @@ Last Update: - 2010-11-05 14:39:35 +0100 + 2010-11-05 15:06:35 +0100 diff --git a/doc/files/main_rb.html b/doc/files/main_rb.html index 3f0de48..192bac7 100644 --- a/doc/files/main_rb.html +++ b/doc/files/main_rb.html @@ -53,7 +53,7 @@ Last Update: - 2010-11-05 14:40:11 +0100 + 2010-11-05 15:05:38 +0100 diff --git a/doc/fr_class_index.html b/doc/fr_class_index.html index 095130d..c330122 100644 --- a/doc/fr_class_index.html +++ b/doc/fr_class_index.html @@ -17,13 +17,13 @@

Classes

- Rir
+ RIR
- Rir::Document
+ RIR::Document
- Rir::WebDocument
+ RIR::WebDocument
- Rir::WikipediaPage
+ RIR::WikipediaPage
String
diff --git a/doc/fr_file_index.html b/doc/fr_file_index.html index 6a45fa1..045567f 100644 --- a/doc/fr_file_index.html +++ b/doc/fr_file_index.html @@ -17,10 +17,6 @@

Files

- README.markdown
- - lib/rir.rb
- lib/rir/document.rb
lib/rir/string.rb
diff --git a/doc/fr_method_index.html b/doc/fr_method_index.html index 3b25a4f..0379b48 100644 --- a/doc/fr_method_index.html +++ b/doc/fr_method_index.html @@ -17,23 +17,23 @@

Methods

- count_words (Rir::Document)
+ count_words (RIR::Document)
- entropy (Rir::Document)
+ entropy (RIR::Document)
extract_xmltags_values (String)
- format_words (Rir::Document)
+ format_words (RIR::Document)
- get_content (Rir::WebDocument)
+ get_content (RIR::WebDocument)
is_stopword? (String)
- new (Rir::WebDocument)
+ new (RIR::WebDocument)
- new (Rir::Document)
+ new (RIR::Document)
- ngrams (Rir::Document)
+ ngrams (RIR::Document)
remove_special_characters (String)
diff --git a/doc/index.html b/doc/index.html index ba843c5..3038b39 100644 --- a/doc/index.html +++ b/doc/index.html @@ -16,6 +16,6 @@ - + diff --git a/lib/rir.rb b/lib/rir.rb index 58b58ff..0f336e0 100644 --- a/lib/rir.rb +++ b/lib/rir.rb @@ -2,3 +2,4 @@ require 'rir/document' require 'rir/string' +require 'rir/query' diff --git a/lib/rir/document.rb b/lib/rir/document.rb index dc80db4..87a5c28 100644 --- a/lib/rir/document.rb +++ b/lib/rir/document.rb @@ -18,7 +18,7 @@ # along with this program. If not, see . # General module for many purposes related to Information Retrieval. -module Rir +module RIR # A Document is a bag of words and is constructed from a string. class Document diff --git a/lib/rir/string.rb b/lib/rir/string.rb index 250cc14..cbf4c23 100644 --- a/lib/rir/string.rb +++ b/lib/rir/string.rb @@ -18,7 +18,7 @@ # along with this program. If not, see . # General module for many purposes related to Information Retrieval. -module Rir +module RIR # These are the default stopwords provided by Lemur. Stoplist = [ @@ -71,7 +71,7 @@ end # Extention of the standard class String with useful function. class String - include Rir + include RIR # Returns +true+ if +self+ belongs to Rir::Stoplist, +false+ otherwise. def is_stopword? @@ -146,10 +146,11 @@ class String self.scan(/<#{tag_name}.*?>(.+?)<\/#{tag_name}>/).flatten end - private def strip_with_pattern(pattern) require 'cgi' require 'kconv' CGI::unescapeHTML(self.gsub(pattern,"")).toutf8 end + + private :strip_with_pattern end diff --git a/main.rb b/main.rb index 5546b36..0ea2e47 100644 --- a/main.rb +++ b/main.rb @@ -1,3 +1,11 @@ $LOAD_PATH.unshift File.expand_path(File.join(File.dirname(__FILE__), "lib")) require 'rir' + +w = RIR::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan") +p w.entropy("guitar") + +params = RIR::Indri::Parameters.new("path_vers_mon_index") +p params.rule +q = RIR::Indri::IndriQuery.new("pouet", "bla", params) +puts q -- 1.8.2.3