Commit 7043da90bf781276184a770f306cfe7b59c17d5a

Authored by Romain Deveaud
0 parents
Exists in master

first commit

Showing 37 changed files with 2767 additions and 0 deletions Inline Diff

File was created 1 # Ruby Information Retrieval (rIR)
2
3 Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
4
5 License
6 =======
7
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>.
20
doc/classes/Rir.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>Module: Rir [RDoc Documentation]</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <meta http-equiv="Content-Script-Type" content="text/javascript" />
8 <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
9 <script type="text/javascript">
10 // <![CDATA[
11
12 function popupCode( url ) {
13 window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
14 }
15
16 function toggleCode( id ) {
17 if ( document.getElementById )
18 elem = document.getElementById( id );
19 else if ( document.all )
20 elem = eval( "document.all." + id );
21 else
22 return false;
23
24 elemStyle = elem.style;
25
26 if ( elemStyle.display != "block" ) {
27 elemStyle.display = "block"
28 } else {
29 elemStyle.display = "none"
30 }
31
32 return true;
33 }
34
35 // Make codeblocks hidden by default
36 document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" )
37
38 // ]]>
39 </script>
40
41 </head>
42 <body>
43
44
45 <div id="classHeader">
46 <table class="header-table">
47 <tr class="top-aligned-row">
48 <td><strong>Module</strong></td>
49 <td class="class-name-in-header">Rir</td>
50 </tr>
51 <tr class="top-aligned-row">
52 <td><strong>In:</strong></td>
53 <td>
54
55
56 <a href="../files/lib/rir/string_rb.html">
57
58 lib/rir/string.rb
59
60 </a>
61
62
63 <br />
64
65
66 <a href="../files/lib/rir/document_rb.html">
67
68 lib/rir/document.rb
69
70 </a>
71
72
73 <br />
74
75 </td>
76 </tr>
77
78
79 </table>
80 </div>
81 <!-- banner header -->
82
83 <div id="bodyContent">
84
85 <div id="contextContent">
86
87 <div id="description">
88 <p>
89 General module for many purposes related to Information Retrieval.
90 </p>
91 <hr size="1"></hr><p>
92 General module for many purposes related to Information Retrieval.
93 </p>
94
95 </div>
96
97 </div>
98
99
100 </div>
101
102 <!-- if includes -->
103
104 <div id="section">
105
106 <div id="class-list">
107 <h3 class="section-bar">Classes and Modules</h3>
108
109 Class <a href="Rir/Document.html" class="link">Rir::Document</a><br />
110 Class <a href="Rir/WebDocument.html" class="link">Rir::WebDocument</a><br />
111 Class <a href="Rir/WikipediaPage.html" class="link">Rir::WikipediaPage</a><br />
112
113 </div>
114
115 <div id="constants-list">
116 <h3 class="section-bar">Constants</h3>
117
118 <div class="name-list">
119 <table summary="Constants">
120
121 <tr class="top-aligned-row context-row">
122 <td class="context-item-name">Stoplist</td>
123 <td>=</td>
124 <td class="context-item-value">[ &quot;a&quot;, &quot;anything&quot;, &quot;anyway&quot;, &quot;anywhere&quot;, &quot;apart&quot;, &quot;are&quot;, &quot;around&quot;, &quot;as&quot;, &quot;at&quot;, &quot;av&quot;, &quot;be&quot;, &quot;became&quot;, &quot;because&quot;, &quot;become&quot;, &quot;becomes&quot;, &quot;becoming&quot;, &quot;been&quot;, &quot;before&quot;, &quot;beforehand&quot;, &quot;behind&quot;, &quot;being&quot;, &quot;below&quot;, &quot;beside&quot;, &quot;besides&quot;, &quot;between&quot;, &quot;beyond&quot;, &quot;both&quot;, &quot;but&quot;, &quot;by&quot;, &quot;can&quot;, &quot;cannot&quot;, &quot;canst&quot;, &quot;certain&quot;, &quot;cf&quot;, &quot;choose&quot;, &quot;contrariwise&quot;, &quot;cos&quot;, &quot;could&quot;, &quot;cu&quot;, &quot;day&quot;, &quot;do&quot;, &quot;does&quot;, &quot;doesn't&quot;, &quot;doing&quot;, &quot;dost&quot;, &quot;doth&quot;, &quot;double&quot;, &quot;down&quot;, &quot;dual&quot;, &quot;during&quot;, &quot;each&quot;, &quot;either&quot;, &quot;else&quot;, &quot;elsewhere&quot;, &quot;enough&quot;, &quot;et&quot;, &quot;etc&quot;, &quot;even&quot;, &quot;ever&quot;, &quot;every&quot;, &quot;everybody&quot;, &quot;everyone&quot;, &quot;everything&quot;, &quot;everywhere&quot;, &quot;except&quot;, &quot;excepted&quot;, &quot;excepting&quot;, &quot;exception&quot;, &quot;exclude&quot;, &quot;excluding&quot;, &quot;exclusive&quot;, &quot;far&quot;, &quot;farther&quot;, &quot;farthest&quot;, &quot;few&quot;, &quot;ff&quot;, &quot;first&quot;, &quot;for&quot;, &quot;formerly&quot;, &quot;forth&quot;, &quot;forward&quot;, &quot;from&quot;, &quot;front&quot;, &quot;further&quot;, &quot;furthermore&quot;, &quot;furthest&quot;, &quot;get&quot;, &quot;go&quot;, &quot;had&quot;, &quot;halves&quot;, &quot;hardly&quot;, &quot;has&quot;, &quot;hast&quot;, &quot;hath&quot;, &quot;have&quot;, &quot;he&quot;, &quot;hence&quot;, &quot;henceforth&quot;, &quot;her&quot;, &quot;here&quot;, &quot;hereabouts&quot;, 
&quot;hereafter&quot;, &quot;hereby&quot;, &quot;herein&quot;, &quot;hereto&quot;, &quot;hereupon&quot;, &quot;hers&quot;, &quot;herself&quot;, &quot;him&quot;, &quot;himself&quot;, &quot;hindmost&quot;, &quot;his&quot;, &quot;hither&quot;, &quot;hitherto&quot;, &quot;how&quot;, &quot;however&quot;, &quot;howsoever&quot;, &quot;i&quot;, &quot;ie&quot;, &quot;if&quot;, &quot;in&quot;, &quot;inasmuch&quot;, &quot;inc&quot;, &quot;include&quot;, &quot;included&quot;, &quot;including&quot;, &quot;indeed&quot;, &quot;indoors&quot;, &quot;inside&quot;, &quot;insomuch&quot;, &quot;instead&quot;, &quot;into&quot;, &quot;inward&quot;, &quot;inwards&quot;, &quot;is&quot;, &quot;it&quot;, &quot;its&quot;, &quot;itself&quot;, &quot;just&quot;, &quot;kind&quot;, &quot;kg&quot;, &quot;km&quot;, &quot;last&quot;, &quot;latter&quot;, &quot;latterly&quot;, &quot;less&quot;, &quot;lest&quot;, &quot;let&quot;, &quot;like&quot;, &quot;little&quot;, &quot;ltd&quot;, &quot;many&quot;, &quot;may&quot;, &quot;maybe&quot;, &quot;me&quot;, &quot;meantime&quot;, &quot;meanwhile&quot;, &quot;might&quot;, &quot;moreover&quot;, &quot;most&quot;, &quot;mostly&quot;, &quot;more&quot;, &quot;mr&quot;, &quot;mrs&quot;, &quot;ms&quot;, &quot;much&quot;, &quot;must&quot;, &quot;my&quot;, &quot;myself&quot;, &quot;namely&quot;, &quot;need&quot;, &quot;neither&quot;, &quot;never&quot;, &quot;nevertheless&quot;, &quot;next&quot;, &quot;no&quot;, &quot;nobody&quot;, &quot;none&quot;, &quot;nonetheless&quot;, &quot;noone&quot;, &quot;nope&quot;, &quot;nor&quot;, &quot;not&quot;, &quot;nothing&quot;, &quot;notwithstanding&quot;, &quot;now&quot;, &quot;nowadays&quot;, &quot;nowhere&quot;, &quot;of&quot;, &quot;off&quot;, &quot;often&quot;, &quot;ok&quot;, &quot;on&quot;, &quot;once&quot;, &quot;one&quot;, &quot;only&quot;, &quot;onto&quot;, &quot;or&quot;, &quot;other&quot;, &quot;others&quot;, &quot;otherwise&quot;, &quot;ought&quot;, &quot;our&quot;, &quot;ours&quot;, &quot;ourselves&quot;, 
&quot;out&quot;, &quot;outside&quot;, &quot;over&quot;, &quot;own&quot;, &quot;per&quot;, &quot;perhaps&quot;, &quot;plenty&quot;, &quot;provide&quot;, &quot;quite&quot;, &quot;rather&quot;, &quot;really&quot;, &quot;round&quot;, &quot;said&quot;, &quot;sake&quot;, &quot;same&quot;, &quot;sang&quot;, &quot;save&quot;, &quot;saw&quot;, &quot;see&quot;, &quot;seeing&quot;, &quot;seem&quot;, &quot;seemed&quot;, &quot;seeming&quot;, &quot;seems&quot;, &quot;seen&quot;, &quot;seldom&quot;, &quot;selves&quot;, &quot;sent&quot;, &quot;several&quot;, &quot;shalt&quot;, &quot;she&quot;, &quot;should&quot;, &quot;shown&quot;, &quot;sideways&quot;, &quot;since&quot;, &quot;slept&quot;, &quot;slew&quot;, &quot;slung&quot;, &quot;slunk&quot;, &quot;smote&quot;, &quot;so&quot;, &quot;some&quot;, &quot;somebody&quot;, &quot;somehow&quot;, &quot;someone&quot;, &quot;something&quot;, &quot;sometime&quot;, &quot;sometimes&quot;, &quot;somewhat&quot;, &quot;somewhere&quot;, &quot;spake&quot;, &quot;spat&quot;, &quot;spoke&quot;, &quot;spoken&quot;, &quot;sprang&quot;, &quot;sprung&quot;, &quot;stave&quot;, &quot;staves&quot;, &quot;still&quot;, &quot;such&quot;, &quot;supposing&quot;, &quot;than&quot;, &quot;that&quot;, &quot;the&quot;, &quot;thee&quot;, &quot;their&quot;, &quot;them&quot;, &quot;themselves&quot;, &quot;then&quot;, &quot;thence&quot;, &quot;thenceforth&quot;, &quot;there&quot;, &quot;thereabout&quot;, &quot;thereabouts&quot;, &quot;thereafter&quot;, &quot;thereby&quot;, &quot;therefore&quot;, &quot;therein&quot;, &quot;thereof&quot;, &quot;thereon&quot;, &quot;thereto&quot;, &quot;thereupon&quot;, &quot;these&quot;, &quot;they&quot;, &quot;this&quot;, &quot;those&quot;, &quot;thou&quot;, &quot;though&quot;, &quot;thrice&quot;, &quot;through&quot;, &quot;throughout&quot;, &quot;thru&quot;, &quot;thus&quot;, &quot;thy&quot;, &quot;thyself&quot;, &quot;till&quot;, &quot;to&quot;, &quot;together&quot;, &quot;too&quot;, &quot;toward&quot;, &quot;towards&quot;, 
&quot;ugh&quot;, &quot;unable&quot;, &quot;under&quot;, &quot;underneath&quot;, &quot;unless&quot;, &quot;unlike&quot;, &quot;until&quot;, &quot;up&quot;, &quot;upon&quot;, &quot;upward&quot;, &quot;upwards&quot;, &quot;us&quot;, &quot;use&quot;, &quot;used&quot;, &quot;using&quot;, &quot;very&quot;, &quot;via&quot;, &quot;vs&quot;, &quot;want&quot;, &quot;was&quot;, &quot;we&quot;, &quot;week&quot;, &quot;well&quot;, &quot;were&quot;, &quot;what&quot;, &quot;whatever&quot;, &quot;whatsoever&quot;, &quot;when&quot;, &quot;whence&quot;, &quot;whenever&quot;, &quot;whensoever&quot;, &quot;where&quot;, &quot;whereabouts&quot;, &quot;whereafter&quot;, &quot;whereas&quot;, &quot;whereat&quot;, &quot;whereby&quot;, &quot;wherefore&quot;, &quot;wherefrom&quot;, &quot;wherein&quot;, &quot;whereinto&quot;, &quot;whereof&quot;, &quot;whereon&quot;, &quot;wheresoever&quot;, &quot;whereto&quot;, &quot;whereunto&quot;, &quot;whereupon&quot;, &quot;wherever&quot;, &quot;wherewith&quot;, &quot;whether&quot;, &quot;whew&quot;, &quot;which&quot;, &quot;whichever&quot;, &quot;whichsoever&quot;, &quot;while&quot;, &quot;whilst&quot;, &quot;whither&quot;, &quot;who&quot;, &quot;whoa&quot;, &quot;whoever&quot;, &quot;whole&quot;, &quot;whom&quot;, &quot;whomever&quot;, &quot;whomsoever&quot;, &quot;whose&quot;, &quot;whosoever&quot;, &quot;why&quot;, &quot;will&quot;, &quot;wilt&quot;, &quot;with&quot;, &quot;within&quot;, &quot;without&quot;, &quot;worse&quot;, &quot;worst&quot;, &quot;would&quot;, &quot;wow&quot;, &quot;ye&quot;, &quot;yet&quot;, &quot;year&quot;, &quot;yippee&quot;, &quot;you&quot;, &quot;your&quot;, &quot;yours&quot;, &quot;yourself&quot;, &quot;yourselves&quot; ]</td>
125
126 <td>&nbsp;</td>
127 <td class="context-item-desc">
128 These are the default stopwords provided by Lemur.
129
130 </td>
131
132 </tr>
133
134 </table>
135 </div>
136 </div>
137
138
139
140
141 <!-- if method_list -->
142
143
144
145
146 </div>
147
148 <div id="validator-badges">
149 <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
150 </div>
151
152 </body>
153 </html>
154
doc/classes/Rir/Document.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>Class: Rir::Document [RDoc Documentation]</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <meta http-equiv="Content-Script-Type" content="text/javascript" />
8 <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
9 <script type="text/javascript">
10 // <![CDATA[
11
12 function popupCode( url ) {
13 window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
14 }
15
16 function toggleCode( id ) {
17 if ( document.getElementById )
18 elem = document.getElementById( id );
19 else if ( document.all )
20 elem = eval( "document.all." + id );
21 else
22 return false;
23
24 elemStyle = elem.style;
25
26 if ( elemStyle.display != "block" ) {
27 elemStyle.display = "block"
28 } else {
29 elemStyle.display = "none"
30 }
31
32 return true;
33 }
34
35 // Make codeblocks hidden by default
36 document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" )
37
38 // ]]>
39 </script>
40
41 </head>
42 <body>
43
44
45 <div id="classHeader">
46 <table class="header-table">
47 <tr class="top-aligned-row">
48 <td><strong>Class</strong></td>
49 <td class="class-name-in-header">Rir::Document</td>
50 </tr>
51 <tr class="top-aligned-row">
52 <td><strong>In:</strong></td>
53 <td>
54
55
56 <a href="../../files/lib/rir/document_rb.html">
57
58 lib/rir/document.rb
59
60 </a>
61
62
63 <br />
64
65 </td>
66 </tr>
67
68
69 <tr class="top-aligned-row">
70 <td><strong>Parent:</strong></td>
71 <td>
72
73 Object
74
75 </td>
76 </tr>
77
78 </table>
79 </div>
80 <!-- banner header -->
81
82 <div id="bodyContent">
83
84 <div id="contextContent">
85
86 <div id="description">
87 <p>
88 A <a href="Document.html">Document</a> is a bag of words and is constructed
89 from a string.
90 </p>
91
92 </div>
93
94 </div>
95
96
97 <div id="method-list">
98 <h3 class="section-bar">Methods</h3>
99
100 <div class="name-list">
101
102 <a href="#M000012">count_words</a>&nbsp;&nbsp;
103
104 <a href="#M000013">entropy</a>&nbsp;&nbsp;
105
106 <a href="#M000010">format_words</a>&nbsp;&nbsp;
107
108 <a href="#M000014">new</a>&nbsp;&nbsp;
109
110 <a href="#M000011">ngrams</a>&nbsp;&nbsp;
111
112 </div>
113 </div>
114
115 </div>
116
117 <!-- if includes -->
118
119 <div id="section">
120
121
122
123 <div id="attribute-list">
124 <h3 class="section-bar">Attributes</h3>
125
126 <div class="name-list">
127 <table>
128
129 <tr class="top-aligned-row context-row">
130 <td class="context-item-name">doc_content</td>
131
132 <td class="context-item-value">&nbsp;[R]&nbsp;</td>
133
134 <td class="context-item-desc"></td>
135 </tr>
136
137 <tr class="top-aligned-row context-row">
138 <td class="context-item-name">words</td>
139
140 <td class="context-item-value">&nbsp;[R]&nbsp;</td>
141
142 <td class="context-item-desc"></td>
143 </tr>
144
145 </table>
146 </div>
147 </div>
148
149
150 <!-- if method_list -->
151
152 <div id="methods">
153
154 <h3 class="section-bar">Public Class methods</h3>
155
156
157 <div id="method-M000014" class="method-detail">
158 <a name="M000014"></a>
159
160 <div class="method-heading">
161
162 <a href="Document.src/M000014.html" target="Code" class="method-signature"
163 onclick="popupCode('Document.src/M000014.html');return false;">
164
165 <span class="method-name">new</span><span class="method-args">(content)</span>
166
167 </a>
168
169 </div>
170
171 <div class="method-description">
172
173 </div>
174 </div>
175
176
177 <h3 class="section-bar">Public Instance methods</h3>
178
179
180 <div id="method-M000012" class="method-detail">
181 <a name="M000012"></a>
182
183 <div class="method-heading">
184
185 <a href="Document.src/M000012.html" target="Code" class="method-signature"
186 onclick="popupCode('Document.src/M000012.html');return false;">
187
188 <span class="method-name">count_words</span><span class="method-args">()</span>
189
190 </a>
191
192 </div>
193
194 <div class="method-description">
195
196 <p>
197 Returns a Hash containing the words and their associated counts in the
198 current <a href="Document.html">Document</a>.
199 </p>
200 <pre>
201 count_words #=&gt; { &quot;guitar&quot;=&gt;1, &quot;bass&quot;=&gt;3, &quot;album&quot;=&gt;20, ... }
202 </pre>
203
204 </div>
205 </div>
206
207
208 <div id="method-M000013" class="method-detail">
209 <a name="M000013"></a>
210
211 <div class="method-heading">
212
213 <a href="Document.src/M000013.html" target="Code" class="method-signature"
214 onclick="popupCode('Document.src/M000013.html');return false;">
215
216 <span class="method-name">entropy</span><span class="method-args">(s)</span>
217
218 </a>
219
220 </div>
221
222 <div class="method-description">
223
224 <p>
225 Computes the entropy of a given string <tt>s</tt> inside the document.
226 </p>
227 <p>
228 If the string parameter is composed of several words (i.e. tokens separated by
229 whitespace), it is treated as an n-gram.
230 </p>
231 <pre>
232 entropy(&quot;guitar&quot;) #=&gt; 0.00389919463243839
233 </pre>
234
235 </div>
236 </div>
237
238
239 <div id="method-M000011" class="method-detail">
240 <a name="M000011"></a>
241
242 <div class="method-heading">
243
244 <a href="Document.src/M000011.html" target="Code" class="method-signature"
245 onclick="popupCode('Document.src/M000011.html');return false;">
246
247 <span class="method-name">ngrams</span><span class="method-args">(n)</span>
248
249 </a>
250
251 </div>
252
253 <div class="method-description">
254
255 <p>
256 Returns an Array containing the <tt>n</tt>-grams (words) from the current
257 <a href="Document.html">Document</a>.
258 </p>
259 <pre>
260 ngrams(2) #=&gt; [&quot;the free&quot;, &quot;free encyclopedia&quot;, &quot;encyclopedia var&quot;, &quot;var skin&quot;, ...]
261 </pre>
262
263 </div>
264 </div>
265
266
267 <h3 class="section-bar">Protected Instance methods</h3>
268
269
270 <div id="method-M000010" class="method-detail">
271 <a name="M000010"></a>
272
273 <div class="method-heading">
274
275 <a href="Document.src/M000010.html" target="Code" class="method-signature"
276 onclick="popupCode('Document.src/M000010.html');return false;">
277
278 <span class="method-name">format_words</span><span class="method-args">()</span>
279
280 </a>
281
282 </div>
283
284 <div class="method-description">
285
286 <p>
287 Any non-word characters are removed from the words (see <a
288 href="http://perldoc.perl.org/perlre.html">perldoc.perl.org/perlre.html</a>
289 and the <tt>\W</tt> special escape).
290 </p>
291 <p>
292 Protected function, only meant to be called at initialization.
293 </p>
294
295 </div>
296 </div>
297
298
299
300 </div>
301
302
303
304
305 </div>
306
307 <div id="validator-badges">
308 <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
309 </div>
310
311 </body>
312 </html>
313
doc/classes/Rir/Document.src/M000010.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>format_words (Rir::Document)</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
8 </head>
9 <body class="standalone-code">
10 <pre><span class="ruby-comment cmt"># File lib/rir/document.rb, line 31</span>
11 <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">format_words</span>
12 <span class="ruby-identifier">wo</span> = []
13
14 <span class="ruby-ivar">@doc_content</span>.<span class="ruby-identifier">split</span>.<span class="ruby-identifier">each</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">w</span><span class="ruby-operator">|</span>
15 <span class="ruby-identifier">w</span>.<span class="ruby-identifier">split</span>(<span class="ruby-regexp re">/\W/</span>).<span class="ruby-identifier">each</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">sw</span><span class="ruby-operator">|</span>
16 <span class="ruby-identifier">wo</span>.<span class="ruby-identifier">push</span>(<span class="ruby-identifier">sw</span>) <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">sw</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/[a-zA-Z]/</span>
17 <span class="ruby-keyword kw">end</span>
18 <span class="ruby-keyword kw">end</span>
19
20 <span class="ruby-identifier">wo</span>
21 <span class="ruby-keyword kw">end</span></pre>
22 </body>
23 </html>
24
doc/classes/Rir/Document.src/M000011.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>ngrams (Rir::Document)</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
8 </head>
9 <body class="standalone-code">
10 <pre><span class="ruby-comment cmt"># File lib/rir/document.rb, line 46</span>
11 <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">ngrams</span>(<span class="ruby-identifier">n</span>)
12 <span class="ruby-identifier">window</span> = []
13 <span class="ruby-identifier">ngrams_array</span> = []
14
15 <span class="ruby-ivar">@words</span>.<span class="ruby-identifier">each</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">w</span><span class="ruby-operator">|</span>
16 <span class="ruby-identifier">window</span>.<span class="ruby-identifier">push</span>(<span class="ruby-identifier">w</span>)
17 <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">window</span>.<span class="ruby-identifier">size</span> <span class="ruby-operator">==</span> <span class="ruby-identifier">n</span>
18 <span class="ruby-identifier">ngrams_array</span>.<span class="ruby-identifier">push</span> <span class="ruby-identifier">window</span>.<span class="ruby-identifier">join</span>(<span class="ruby-value str">&quot; &quot;</span>)
19 <span class="ruby-identifier">window</span>.<span class="ruby-identifier">delete_at</span>(<span class="ruby-value">0</span>)
20 <span class="ruby-keyword kw">end</span>
21 <span class="ruby-keyword kw">end</span>
22
23 <span class="ruby-identifier">ngrams_array</span>.<span class="ruby-identifier">uniq</span>
24 <span class="ruby-keyword kw">end</span></pre>
25 </body>
26 </html>
27
doc/classes/Rir/Document.src/M000012.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>count_words (Rir::Document)</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
8 </head>
9 <body class="standalone-code">
10 <pre><span class="ruby-comment cmt"># File lib/rir/document.rb, line 64</span>
11 <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">count_words</span>
12 <span class="ruby-identifier">counts</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">h</span>,<span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">h</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-value">0</span> }
13 <span class="ruby-ivar">@words</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">w</span><span class="ruby-operator">|</span> <span class="ruby-identifier">counts</span>[<span class="ruby-identifier">w</span>.<span class="ruby-identifier">downcase</span>] <span class="ruby-operator">+=</span> <span class="ruby-value">1</span> }
14
15 <span class="ruby-identifier">counts</span>
16 <span class="ruby-keyword kw">end</span></pre>
17 </body>
18 </html>
19
doc/classes/Rir/Document.src/M000013.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>entropy (Rir::Document)</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
8 </head>
9 <body class="standalone-code">
10 <pre><span class="ruby-comment cmt"># File lib/rir/document.rb, line 77</span>
11 <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">entropy</span>(<span class="ruby-identifier">s</span>)
12 <span class="ruby-identifier">en</span> = <span class="ruby-value">0</span><span class="ruby-value">.0</span>
13 <span class="ruby-identifier">counts</span> = <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">count_words</span>
14
15 <span class="ruby-identifier">s</span>.<span class="ruby-identifier">split</span>.<span class="ruby-identifier">each</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">w</span><span class="ruby-operator">|</span>
16 <span class="ruby-identifier">p_wi</span> = <span class="ruby-identifier">counts</span>[<span class="ruby-identifier">w</span>].<span class="ruby-identifier">to_f</span><span class="ruby-operator">/</span><span class="ruby-ivar">@words</span>.<span class="ruby-identifier">count</span>.<span class="ruby-identifier">to_f</span>
17 <span class="ruby-identifier">en</span> <span class="ruby-operator">+=</span> <span class="ruby-identifier">p_wi</span><span class="ruby-operator">*</span><span class="ruby-constant">Math</span>.<span class="ruby-identifier">log2</span>(<span class="ruby-identifier">p_wi</span>)
18 <span class="ruby-keyword kw">end</span>
19
20 <span class="ruby-identifier">en</span> <span class="ruby-operator">*=</span> <span class="ruby-value">-1</span>
21 <span class="ruby-identifier">en</span>
22 <span class="ruby-keyword kw">end</span></pre>
23 </body>
24 </html>
25
doc/classes/Rir/Document.src/M000014.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>new (Rir::Document)</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
8 </head>
9 <body class="standalone-code">
10 <pre><span class="ruby-comment cmt"># File lib/rir/document.rb, line 92</span>
11 <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">content</span>)
12 <span class="ruby-ivar">@doc_content</span> = <span class="ruby-identifier">content</span>
13 <span class="ruby-ivar">@words</span> = <span class="ruby-identifier">format_words</span>
14 <span class="ruby-keyword kw">end</span></pre>
15 </body>
16 </html>
17
doc/classes/Rir/WebDocument.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>Class: Rir::WebDocument [RDoc Documentation]</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <meta http-equiv="Content-Script-Type" content="text/javascript" />
8 <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
9 <script type="text/javascript">
10 // <![CDATA[
11
12 function popupCode( url ) {
13 window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
14 }
15
16 function toggleCode( id ) {
17 if ( document.getElementById )
18 elem = document.getElementById( id );
19 else if ( document.all )
20 elem = eval( "document.all." + id );
21 else
22 return false;
23
24 elemStyle = elem.style;
25
26 if ( elemStyle.display != "block" ) {
27 elemStyle.display = "block"
28 } else {
29 elemStyle.display = "none"
30 }
31
32 return true;
33 }
34
35 // Make codeblocks hidden by default
36 document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" )
37
38 // ]]>
39 </script>
40
41 </head>
42 <body>
43
44
45 <div id="classHeader">
46 <table class="header-table">
47 <tr class="top-aligned-row">
48 <td><strong>Class</strong></td>
49 <td class="class-name-in-header">Rir::WebDocument</td>
50 </tr>
51 <tr class="top-aligned-row">
52 <td><strong>In:</strong></td>
53 <td>
54
55
56 <a href="../../files/lib/rir/document_rb.html">
57
58 lib/rir/document.rb
59
60 </a>
61
62
63 <br />
64
65 </td>
66 </tr>
67
68
69 <tr class="top-aligned-row">
70 <td><strong>Parent:</strong></td>
71 <td>
72
73 <a href="Document.html">
74
75 Rir::Document
76
77 </a>
78
79 </td>
80 </tr>
81
82 </table>
83 </div>
84 <!-- banner header -->
85
86 <div id="bodyContent">
87
88 <div id="contextContent">
89
90 <div id="description">
91 <p>
92 A <a href="WebDocument.html">WebDocument</a> is a <a
93 href="Document.html">Document</a> with a <tt>url</tt>.
94 </p>
95
96 </div>
97
98 </div>
99
100
101 <div id="method-list">
102 <h3 class="section-bar">Methods</h3>
103
104 <div class="name-list">
105
106 <a href="#M000015">get_content</a>&nbsp;&nbsp;
107
108 <a href="#M000016">new</a>&nbsp;&nbsp;
109
110 </div>
111 </div>
112
113 </div>
114
115 <!-- if includes -->
116
117 <div id="section">
118
119
120
121 <div id="attribute-list">
122 <h3 class="section-bar">Attributes</h3>
123
124 <div class="name-list">
125 <table>
126
127 <tr class="top-aligned-row context-row">
128 <td class="context-item-name">url</td>
129
130 <td class="context-item-value">&nbsp;[R]&nbsp;</td>
131
132 <td class="context-item-desc"></td>
133 </tr>
134
135 </table>
136 </div>
137 </div>
138
139
140 <!-- if method_list -->
141
142 <div id="methods">
143
144 <h3 class="section-bar">Public Class methods</h3>
145
146
147 <div id="method-M000015" class="method-detail">
148 <a name="M000015"></a>
149
150 <div class="method-heading">
151
152 <a href="WebDocument.src/M000015.html" target="Code" class="method-signature"
153 onclick="popupCode('WebDocument.src/M000015.html');return false;">
154
155 <span class="method-name">get_content</span><span class="method-args">(url)</span>
156
157 </a>
158
159 </div>
160
161 <div class="method-description">
162
163 <p>
164 Returns the HTML text from the page of a given <tt>url</tt>.
165 </p>
166
167 </div>
168 </div>
169
170
171 <div id="method-M000016" class="method-detail">
172 <a name="M000016"></a>
173
174 <div class="method-heading">
175
176 <a href="WebDocument.src/M000016.html" target="Code" class="method-signature"
177 onclick="popupCode('WebDocument.src/M000016.html');return false;">
178
179 <span class="method-name">new</span><span class="method-args">(url)</span>
180
181 </a>
182
183 </div>
184
185 <div class="method-description">
186
187 <p>
188 <a href="WebDocument.html">WebDocument</a> constructor; the content of the
189 <a href="Document.html">Document</a> is the HTML page without the tags.
190 </p>
191
192 </div>
193 </div>
194
195
196
197 </div>
198
199
200
201
202 </div>
203
204 <div id="validator-badges">
205 <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
206 </div>
207
208 </body>
209 </html>
210
doc/classes/Rir/WebDocument.src/M000015.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>get_content (Rir::WebDocument)</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
8 </head>
9 <body class="standalone-code">
10 <pre><span class="ruby-comment cmt"># File lib/rir/document.rb, line 105</span>
11 <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">get_content</span>(<span class="ruby-identifier">url</span>)
12 <span class="ruby-identifier">require</span> <span class="ruby-value str">'net/http'</span>
13 <span class="ruby-constant">Net</span><span class="ruby-operator">::</span><span class="ruby-constant">HTTP</span>.<span class="ruby-identifier">get</span>(<span class="ruby-constant">URI</span>.<span class="ruby-identifier">parse</span>(<span class="ruby-identifier">url</span>))
14 <span class="ruby-keyword kw">end</span></pre>
15 </body>
16 </html>
17
doc/classes/Rir/WebDocument.src/M000016.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>new (Rir::WebDocument)</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
8 </head>
9 <body class="standalone-code">
10 <pre><span class="ruby-comment cmt"># File lib/rir/document.rb, line 112</span>
11 <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">url</span>)
12 <span class="ruby-ivar">@url</span> = <span class="ruby-identifier">url</span>
13 <span class="ruby-keyword kw">super</span> <span class="ruby-constant">WebDocument</span>.<span class="ruby-identifier">get_content</span>(<span class="ruby-identifier">url</span>).<span class="ruby-identifier">strip_javascripts</span>.<span class="ruby-identifier">strip_stylesheets</span>.<span class="ruby-identifier">strip_xml_tags</span>
14 <span class="ruby-keyword kw">end</span></pre>
15 </body>
16 </html>
17
doc/classes/Rir/WikipediaPage.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>Class: Rir::WikipediaPage [RDoc Documentation]</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <meta http-equiv="Content-Script-Type" content="text/javascript" />
8 <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
9 <script type="text/javascript">
10 // <![CDATA[
11
12 function popupCode( url ) {
13 window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
14 }
15
16 function toggleCode( id ) {
17 if ( document.getElementById )
18 elem = document.getElementById( id );
19 else if ( document.all )
20 elem = eval( "document.all." + id );
21 else
22 return false;
23
24 elemStyle = elem.style;
25
26 if ( elemStyle.display != "block" ) {
27 elemStyle.display = "block"
28 } else {
29 elemStyle.display = "none"
30 }
31
32 return true;
33 }
34
35 // Make codeblocks hidden by default
36 document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" )
37
38 // ]]>
39 </script>
40
41 </head>
42 <body>
43
44
45 <div id="classHeader">
46 <table class="header-table">
47 <tr class="top-aligned-row">
48 <td><strong>Class</strong></td>
49 <td class="class-name-in-header">Rir::WikipediaPage</td>
50 </tr>
51 <tr class="top-aligned-row">
52 <td><strong>In:</strong></td>
53 <td>
54
55
56 <a href="../../files/lib/rir/document_rb.html">
57
58 lib/rir/document.rb
59
60 </a>
61
62
63 <br />
64
65 </td>
66 </tr>
67
68
69 <tr class="top-aligned-row">
70 <td><strong>Parent:</strong></td>
71 <td>
72
73 <a href="WebDocument.html">
74
75 Rir::WebDocument
76
77 </a>
78
79 </td>
80 </tr>
81
82 </table>
83 </div>
84 <!-- banner header -->
85
86 <div id="bodyContent">
87
88 <div id="contextContent">
89
90 <div id="description">
91 <p>
92 A <a href="WikipediaPage.html">WikipediaPage</a> is a <a
93 href="WebDocument.html">WebDocument</a>.
94 </p>
95
96 </div>
97
98 </div>
99
100
101 </div>
102
103 <!-- if includes -->
104
105 <div id="section">
106
107
108
109
110 <!-- if method_list -->
111
112
113
114
115 </div>
116
117 <div id="validator-badges">
118 <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
119 </div>
120
121 </body>
122 </html>
123
doc/classes/String.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>Class: String [RDoc Documentation]</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <meta http-equiv="Content-Script-Type" content="text/javascript" />
8 <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
9 <script type="text/javascript">
10 // <![CDATA[
11
12 function popupCode( url ) {
13 window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
14 }
15
16 function toggleCode( id ) {
17 if ( document.getElementById )
18 elem = document.getElementById( id );
19 else if ( document.all )
20 elem = eval( "document.all." + id );
21 else
22 return false;
23
24 elemStyle = elem.style;
25
26 if ( elemStyle.display != "block" ) {
27 elemStyle.display = "block"
28 } else {
29 elemStyle.display = "none"
30 }
31
32 return true;
33 }
34
35 // Make codeblocks hidden by default
36 document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" )
37
38 // ]]>
39 </script>
40
41 </head>
42 <body>
43
44
45 <div id="classHeader">
46 <table class="header-table">
47 <tr class="top-aligned-row">
48 <td><strong>Class</strong></td>
49 <td class="class-name-in-header">String</td>
50 </tr>
51 <tr class="top-aligned-row">
52 <td><strong>In:</strong></td>
53 <td>
54
55
56 <a href="../files/lib/rir/string_rb.html">
57
58 lib/rir/string.rb
59
60 </a>
61
62
63 <br />
64
65 </td>
66 </tr>
67
68
69 <tr class="top-aligned-row">
70 <td><strong>Parent:</strong></td>
71 <td>
72
73 Object
74
75 </td>
76 </tr>
77
78 </table>
79 </div>
80 <!-- banner header -->
81
82 <div id="bodyContent">
83
84 <div id="contextContent">
85
86 <div id="description">
87 <p>
88 Extension of the standard class <a href="String.html">String</a> with
89 useful functions.
90 </p>
91
92 </div>
93
94 </div>
95
96
97 <div id="method-list">
98 <h3 class="section-bar">Methods</h3>
99
100 <div class="name-list">
101
102 <a href="#M000009">extract_xmltags_values</a>&nbsp;&nbsp;
103
104 <a href="#M000001">is_stopword?</a>&nbsp;&nbsp;
105
106 <a href="#M000002">remove_special_characters</a>&nbsp;&nbsp;
107
108 <a href="#M000006">strip_javascripts</a>&nbsp;&nbsp;
109
110 <a href="#M000005">strip_javascripts!</a>&nbsp;&nbsp;
111
112 <a href="#M000008">strip_stylesheets</a>&nbsp;&nbsp;
113
114 <a href="#M000007">strip_stylesheets!</a>&nbsp;&nbsp;
115
116 <a href="#M000004">strip_xml_tags</a>&nbsp;&nbsp;
117
118 <a href="#M000003">strip_xml_tags!</a>&nbsp;&nbsp;
119
120 </div>
121 </div>
122
123 </div>
124
125 <!-- if includes -->
126
127 <div id="includes">
128 <h3 class="section-bar">Included Modules</h3>
129
130 <div id="includes-list">
131
132 <span class="include-name"><a href="Rir.html">Rir</a></span>
133
134 </div>
135 </div>
136
137 <div id="section">
138
139
140
141
142 <!-- if method_list -->
143
144 <div id="methods">
145
146 <h3 class="section-bar">Public Instance methods</h3>
147
148
149 <div id="method-M000009" class="method-detail">
150 <a name="M000009"></a>
151
152 <div class="method-heading">
153
154 <a href="String.src/M000009.html" target="Code" class="method-signature"
155 onclick="popupCode('String.src/M000009.html');return false;">
156
157 <span class="method-name">extract_xmltags_values</span><span class="method-args">(tag_name)</span>
158
159 </a>
160
161 </div>
162
163 <div class="method-description">
164
165 <p>
166 Returns the text values inside all occurrences of an XML tag in <tt>self</tt>
167 </p>
168 <pre>
169 s = &quot;four-piece in &lt;a href='#'&gt;Indianapolis&lt;/a&gt;, &lt;a href='#'&gt;Indiana&lt;/a&gt; at the Murat Theatre&quot;
170 s.extract_xmltags_values 'a' #=&gt; [&quot;Indianapolis&quot;, &quot;Indiana&quot;]
171 </pre>
172
173 </div>
174 </div>
175
176
177 <div id="method-M000001" class="method-detail">
178 <a name="M000001"></a>
179
180 <div class="method-heading">
181
182 <a href="String.src/M000001.html" target="Code" class="method-signature"
183 onclick="popupCode('String.src/M000001.html');return false;">
184
185 <span class="method-name">is_stopword?</span><span class="method-args">()</span>
186
187 </a>
188
189 </div>
190
191 <div class="method-description">
192
193 <p>
194 Returns <tt>true</tt> if <tt>self</tt> belongs to Rir::Stoplist,
195 <tt>false</tt> otherwise.
196 </p>
197
198 </div>
199 </div>
200
201
202 <div id="method-M000002" class="method-detail">
203 <a name="M000002"></a>
204
205 <div class="method-heading">
206
207 <a href="String.src/M000002.html" target="Code" class="method-signature"
208 onclick="popupCode('String.src/M000002.html');return false;">
209
210 <span class="method-name">remove_special_characters</span><span class="method-args">()</span>
211
212 </a>
213
214 </div>
215
216 <div class="method-description">
217
218 <p>
219 Do not use. TODO: revamp; find out why this function is here.
220 </p>
221
222 </div>
223 </div>
224
225
226 <div id="method-M000006" class="method-detail">
227 <a name="M000006"></a>
228
229 <div class="method-heading">
230
231 <a href="String.src/M000006.html" target="Code" class="method-signature"
232 onclick="popupCode('String.src/M000006.html');return false;">
233
234 <span class="method-name">strip_javascripts</span><span class="method-args">()</span>
235
236 </a>
237
238 </div>
239
240 <div class="method-description">
241
242 <p>
243 Removes all Javascript sources from <tt>self</tt>.
244 </p>
245 <pre>
246 s = &quot;&lt;script type='text/javascript'&gt;
247 var skin='vector',
248 stylepath='http://bits.wikimedia.org/skins-1.5'
249 &lt;/script&gt;
250
251 test&quot;
252 s.strip_javascripts #=&gt; &quot;test&quot;
253 </pre>
254
255 </div>
256 </div>
257
258
259 <div id="method-M000005" class="method-detail">
260 <a name="M000005"></a>
261
262 <div class="method-heading">
263
264 <a href="String.src/M000005.html" target="Code" class="method-signature"
265 onclick="popupCode('String.src/M000005.html');return false;">
266
267 <span class="method-name">strip_javascripts!</span><span class="method-args">()</span>
268
269 </a>
270
271 </div>
272
273 <div class="method-description">
274
275 <p>
276 Removes all Javascript sources from <tt>self</tt>.
277 </p>
278 <pre>
279 s = &quot;&lt;script type='text/javascript'&gt;
280 var skin='vector',
281 stylepath='http://bits.wikimedia.org/skins-1.5'
282 &lt;/script&gt;
283
284 test&quot;
285 s.strip_javascripts!
286 s #=&gt; &quot;test&quot;
287 </pre>
288
289 </div>
290 </div>
291
292
293 <div id="method-M000008" class="method-detail">
294 <a name="M000008"></a>
295
296 <div class="method-heading">
297
298 <a href="String.src/M000008.html" target="Code" class="method-signature"
299 onclick="popupCode('String.src/M000008.html');return false;">
300
301 <span class="method-name">strip_stylesheets</span><span class="method-args">()</span>
302
303 </a>
304
305 </div>
306
307 <div class="method-description">
308
309 </div>
310 </div>
311
312
313 <div id="method-M000007" class="method-detail">
314 <a name="M000007"></a>
315
316 <div class="method-heading">
317
318 <a href="String.src/M000007.html" target="Code" class="method-signature"
319 onclick="popupCode('String.src/M000007.html');return false;">
320
321 <span class="method-name">strip_stylesheets!</span><span class="method-args">()</span>
322
323 </a>
324
325 </div>
326
327 <div class="method-description">
328
329 </div>
330 </div>
331
332
333 <div id="method-M000004" class="method-detail">
334 <a name="M000004"></a>
335
336 <div class="method-heading">
337
338 <a href="String.src/M000004.html" target="Code" class="method-signature"
339 onclick="popupCode('String.src/M000004.html');return false;">
340
341 <span class="method-name">strip_xml_tags</span><span class="method-args">()</span>
342
343 </a>
344
345 </div>
346
347 <div class="method-description">
348
349 <p>
350 Removes all XML-like tags from <tt>self</tt>.
351 </p>
352 <pre>
353 s = &quot;&lt;html&gt;&lt;body&gt;test&lt;/body&gt;&lt;/html&gt;&quot;
354 s.strip_xml_tags #=&gt; &quot;test&quot;
355 s #=&gt; &quot;&lt;html&gt;&lt;body&gt;test&lt;/body&gt;&lt;/html&gt;&quot;
356 </pre>
357
358 </div>
359 </div>
360
361
362 <div id="method-M000003" class="method-detail">
363 <a name="M000003"></a>
364
365 <div class="method-heading">
366
367 <a href="String.src/M000003.html" target="Code" class="method-signature"
368 onclick="popupCode('String.src/M000003.html');return false;">
369
370 <span class="method-name">strip_xml_tags!</span><span class="method-args">()</span>
371
372 </a>
373
374 </div>
375
376 <div class="method-description">
377
378 <p>
379 Removes all XML-like tags from <tt>self</tt>.
380 </p>
381 <pre>
382 s = &quot;&lt;html&gt;&lt;body&gt;test&lt;/body&gt;&lt;/html&gt;&quot;
383 s.strip_xml_tags!
384 s #=&gt; &quot;test&quot;
385 </pre>
386
387 </div>
388 </div>
389
390
391
392 </div>
393
394
395
396
397 </div>
398
399 <div id="validator-badges">
400 <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
401 </div>
402
403 </body>
404 </html>
405
doc/classes/String.src/M000001.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>is_stopword? (String)</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
8 </head>
9 <body class="standalone-code">
10 <pre><span class="ruby-comment cmt"># File lib/rir/string.rb, line 77</span>
11 <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">is_stopword?</span>
12 <span class="ruby-constant">Stoplist</span>.<span class="ruby-identifier">include?</span>(<span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">downcase</span>)
13 <span class="ruby-keyword kw">end</span></pre>
14 </body>
15 </html>
16
doc/classes/String.src/M000002.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>remove_special_characters (String)</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
8 </head>
9 <body class="standalone-code">
10 <pre><span class="ruby-comment cmt"># File lib/rir/string.rb, line 83</span>
11 <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">remove_special_characters</span>
12 <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">split</span>.<span class="ruby-identifier">collect</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">w</span><span class="ruby-operator">|</span> <span class="ruby-identifier">w</span>.<span class="ruby-identifier">gsub</span>(<span class="ruby-regexp re">/\W/</span>,<span class="ruby-value str">' '</span>).<span class="ruby-identifier">split</span>.<span class="ruby-identifier">collect</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">w</span><span class="ruby-operator">|</span> <span class="ruby-identifier">w</span>.<span class="ruby-identifier">gsub</span>(<span class="ruby-regexp re">/\W/</span>,<span class="ruby-value str">' '</span>).<span class="ruby-identifier">strip</span>.<span class="ruby-identifier">sub</span>(<span class="ruby-regexp re">/\A.\z/</span>, <span class="ruby-value str">''</span>)}.<span class="ruby-identifier">join</span>(<span class="ruby-value str">' '</span>).<span class="ruby-identifier">strip</span>.<span class="ruby-identifier">sub</span>(<span class="ruby-regexp re">/\A.\z/</span>, <span class="ruby-value str">''</span>)}.<span class="ruby-identifier">join</span>(<span class="ruby-value str">' '</span>)
13 <span class="ruby-keyword kw">end</span></pre>
14 </body>
15 </html>
16
doc/classes/String.src/M000003.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>strip_xml_tags! (String)</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
8 </head>
9 <body class="standalone-code">
10 <pre><span class="ruby-comment cmt"># File lib/rir/string.rb, line 92</span>
11 <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">strip_xml_tags!</span>
12 <span class="ruby-identifier">replace</span> <span class="ruby-identifier">strip_with_pattern</span> <span class="ruby-operator">/</span><span class="ruby-operator">&lt;</span>\<span class="ruby-regexp re">/?[^&gt;]*&gt;/</span>
13 <span class="ruby-keyword kw">end</span></pre>
14 </body>
15 </html>
16
doc/classes/String.src/M000004.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>strip_xml_tags (String)</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
8 </head>
9 <body class="standalone-code">
10 <pre><span class="ruby-comment cmt"># File lib/rir/string.rb, line 101</span>
11 <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">strip_xml_tags</span>
12 <span class="ruby-identifier">dup</span>.<span class="ruby-identifier">strip_xml_tags!</span>
13 <span class="ruby-keyword kw">end</span></pre>
14 </body>
15 </html>
16
doc/classes/String.src/M000005.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>strip_javascripts! (String)</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
8 </head>
9 <body class="standalone-code">
10 <pre><span class="ruby-comment cmt"># File lib/rir/string.rb, line 115</span>
11 <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">strip_javascripts!</span>
12 <span class="ruby-identifier">replace</span> <span class="ruby-identifier">strip_with_pattern</span> <span class="ruby-operator">/</span><span class="ruby-operator">&lt;</span><span class="ruby-identifier">script</span> <span class="ruby-identifier">type</span>=<span class="ruby-value str">&quot;text\/javascript&quot;</span><span class="ruby-operator">&gt;</span>(.<span class="ruby-operator">+</span><span class="ruby-value">?)</span><span class="ruby-operator">&lt;</span>\<span class="ruby-regexp re">/script&gt;/</span><span class="ruby-identifier">m</span>
13 <span class="ruby-keyword kw">end</span></pre>
14 </body>
15 </html>
16
doc/classes/String.src/M000006.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>strip_javascripts (String)</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
8 </head>
9 <body class="standalone-code">
10 <pre><span class="ruby-comment cmt"># File lib/rir/string.rb, line 128</span>
11 <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">strip_javascripts</span>
12 <span class="ruby-identifier">dup</span>.<span class="ruby-identifier">strip_javascripts!</span>
13 <span class="ruby-keyword kw">end</span></pre>
14 </body>
15 </html>
16
doc/classes/String.src/M000007.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>strip_stylesheets! (String)</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
8 </head>
9 <body class="standalone-code">
10 <pre><span class="ruby-comment cmt"># File lib/rir/string.rb, line 132</span>
11 <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">strip_stylesheets!</span>
12 <span class="ruby-comment cmt"># TODO: rewamp. dunno what is it.</span>
13 <span class="ruby-identifier">replace</span> <span class="ruby-identifier">strip_with_pattern</span> <span class="ruby-operator">/</span><span class="ruby-operator">&lt;</span><span class="ruby-identifier">style</span> <span class="ruby-identifier">type</span>=<span class="ruby-value str">&quot;text\/css&quot;</span><span class="ruby-operator">&gt;</span>(.<span class="ruby-operator">+</span><span class="ruby-value">?)</span><span class="ruby-operator">&lt;</span>\<span class="ruby-regexp re">/style&gt;/</span><span class="ruby-identifier">m</span>
14 <span class="ruby-keyword kw">end</span></pre>
15 </body>
16 </html>
17
doc/classes/String.src/M000008.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>strip_stylesheets (String)</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
8 </head>
9 <body class="standalone-code">
10 <pre><span class="ruby-comment cmt"># File lib/rir/string.rb, line 137</span>
11 <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">strip_stylesheets</span>
12 <span class="ruby-identifier">dup</span>.<span class="ruby-identifier">strip_stylesheets!</span>
13 <span class="ruby-keyword kw">end</span></pre>
14 </body>
15 </html>
16
doc/classes/String.src/M000009.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>extract_xmltags_values (String)</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
8 </head>
9 <body class="standalone-code">
10 <pre><span class="ruby-comment cmt"># File lib/rir/string.rb, line 145</span>
11 <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">extract_xmltags_values</span>(<span class="ruby-identifier">tag_name</span>)
12 <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">scan</span>(<span class="ruby-node">/&lt;#{tag_name}.*?&gt;(.+?)&lt;\/#{tag_name}&gt;/</span>).<span class="ruby-identifier">flatten</span>
13 <span class="ruby-keyword kw">end</span></pre>
14 </body>
15 </html>
16
File was created 1 Fri, 05 Nov 2010 14:41:10 +0100
2
doc/files/README_markdown.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>File: README.markdown [RDoc Documentation]</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <meta http-equiv="Content-Script-Type" content="text/javascript" />
8 <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
9 <script type="text/javascript">
10 // <![CDATA[
11
12 function popupCode( url ) {
13 window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
14 }
15
16 function toggleCode( id ) {
17 if ( document.getElementById )
18 elem = document.getElementById( id );
19 else if ( document.all )
20 elem = eval( "document.all." + id );
21 else
22 return false;
23
24 elemStyle = elem.style;
25
26 if ( elemStyle.display != "block" ) {
27 elemStyle.display = "block"
28 } else {
29 elemStyle.display = "none"
30 }
31
32 return true;
33 }
34
35 // Make codeblocks hidden by default
36 document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" )
37
38 // ]]>
39 </script>
40
41 </head>
42 <body>
43
44
45 <div id="fileHeader">
46 <h1>README.markdown</h1>
47 <table class="header-table">
48 <tr class="top-aligned-row">
49 <td><strong>Path:</strong></td>
50 <td>README.markdown
51
52 </td>
53 </tr>
54 <tr class="top-aligned-row">
55 <td><strong>Last Update:</strong></td>
56 <td>2010-11-05 14:40:41 +0100</td>
57 </tr>
58 </table>
59 </div>
60 <!-- banner header -->
61
62 <div id="bodyContent">
63
64 <div id="contextContent">
65
66 </div>
67
68
69 </div>
70
71 <!-- if includes -->
72
73 <div id="section">
74
75
76
77
78 <!-- if method_list -->
79
80
81
82
83 </div>
84
85 <div id="validator-badges">
86 <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
87 </div>
88
89 </body>
90 </html>
91
doc/files/lib/rir/document_rb.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>File: document.rb [RDoc Documentation]</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <meta http-equiv="Content-Script-Type" content="text/javascript" />
8 <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
9 <script type="text/javascript">
10 // <![CDATA[
11
12 function popupCode( url ) {
13 window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
14 }
15
16 function toggleCode( id ) {
17 if ( document.getElementById )
18 elem = document.getElementById( id );
19 else if ( document.all )
20 elem = eval( "document.all." + id );
21 else
22 return false;
23
24 elemStyle = elem.style;
25
26 if ( elemStyle.display != "block" ) {
27 elemStyle.display = "block"
28 } else {
29 elemStyle.display = "none"
30 }
31
32 return true;
33 }
34
35 // Make codeblocks hidden by default
36 document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" )
37
38 // ]]>
39 </script>
40
41 </head>
42 <body>
43
44
45 <div id="fileHeader">
46 <h1>document.rb</h1>
47 <table class="header-table">
48 <tr class="top-aligned-row">
49 <td><strong>Path:</strong></td>
50 <td>lib/rir/document.rb
51
52 </td>
53 </tr>
54 <tr class="top-aligned-row">
55 <td><strong>Last Update:</strong></td>
56 <td>2010-11-05 14:39:35 +0100</td>
57 </tr>
58 </table>
59 </div>
60 <!-- banner header -->
61
62 <div id="bodyContent">
63
64 <div id="contextContent">
65
66 <div id="description">
67 <p>
68 This file is a part of an Information Retrieval oriented Ruby library
69 </p>
70 <p>
71 Copyright (C) 2010-2011 Romain Deveaud &lt;romain.deveaud@gmail.com&gt;
72 </p>
73 <p>
74 This program is free software: you can redistribute it and/or modify it
75 under the terms of the GNU General Public License as published by the Free
76 Software Foundation, either version 3 of the License, or (at your option)
77 any later version.
78 </p>
79 <p>
80 This program is distributed in the hope that it will be useful, but WITHOUT
81 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
82 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
83 more details.
84 </p>
85 <p>
86 You should have received a copy of the GNU General Public License along
87 with this program. If not, see &lt;<a
88 href="http://www.gnu.org/licenses/">www.gnu.org/licenses/</a>&gt;.
89 </p>
90
91 </div>
92
93 <div id="requires-list">
94 <h3 class="section-bar">Required files</h3>
95
96 <div class="name-list">
97
98 net/http&nbsp;&nbsp;
99
100 </div>
101 </div>
102
103 </div>
104
105
106 </div>
107
108 <!-- if includes -->
109
110 <div id="section">
111
112
113
114
115 <!-- if method_list -->
116
117
118
119
120 </div>
121
122 <div id="validator-badges">
123 <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
124 </div>
125
126 </body>
127 </html>
128
doc/files/lib/rir/string_rb.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>File: string.rb [RDoc Documentation]</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <meta http-equiv="Content-Script-Type" content="text/javascript" />
8 <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
9 <script type="text/javascript">
10 // <![CDATA[
11
12 function popupCode( url ) {
13 window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
14 }
15
16 function toggleCode( id ) {
17 if ( document.getElementById )
18 elem = document.getElementById( id );
19 else if ( document.all )
20 elem = eval( "document.all." + id );
21 else
22 return false;
23
24 elemStyle = elem.style;
25
26 if ( elemStyle.display != "block" ) {
27 elemStyle.display = "block"
28 } else {
29 elemStyle.display = "none"
30 }
31
32 return true;
33 }
34
35 // Make codeblocks hidden by default
36 document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" )
37
38 // ]]>
39 </script>
40
41 </head>
42 <body>
43
44
45 <div id="fileHeader">
46 <h1>string.rb</h1>
47 <table class="header-table">
48 <tr class="top-aligned-row">
49 <td><strong>Path:</strong></td>
50 <td>lib/rir/string.rb
51
52 </td>
53 </tr>
54 <tr class="top-aligned-row">
55 <td><strong>Last Update:</strong></td>
56 <td>2010-11-05 14:39:35 +0100</td>
57 </tr>
58 </table>
59 </div>
60 <!-- banner header -->
61
62 <div id="bodyContent">
63
64 <div id="contextContent">
65
66 <div id="description">
67 <p>
68 This file is a part of an Information Retrieval oriented Ruby library
69 </p>
70 <p>
71 Copyright (C) 2010-2011 Romain Deveaud &lt;romain.deveaud@gmail.com&gt;
72 </p>
73 <p>
74 This program is free software: you can redistribute it and/or modify it
75 under the terms of the GNU General Public License as published by the Free
76 Software Foundation, either version 3 of the License, or (at your option)
77 any later version.
78 </p>
79 <p>
80 This program is distributed in the hope that it will be useful, but WITHOUT
81 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
82 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
83 more details.
84 </p>
85 <p>
86 You should have received a copy of the GNU General Public License along
87 with this program. If not, see &lt;<a
88 href="http://www.gnu.org/licenses/">www.gnu.org/licenses/</a>&gt;.
89 </p>
90
91 </div>
92
93 <div id="requires-list">
94 <h3 class="section-bar">Required files</h3>
95
96 <div class="name-list">
97
98 cgi&nbsp;&nbsp;
99
100 kconv&nbsp;&nbsp;
101
102 </div>
103 </div>
104
105 </div>
106
107
108 </div>
109
110 <!-- if includes -->
111
112 <div id="section">
113
114
115
116
117 <!-- if method_list -->
118
119
120
121
122 </div>
123
124 <div id="validator-badges">
125 <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
126 </div>
127
128 </body>
129 </html>
130
doc/files/lib/rir_rb.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>File: rir.rb [RDoc Documentation]</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <meta http-equiv="Content-Script-Type" content="text/javascript" />
8 <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
9 <script type="text/javascript">
10 // <![CDATA[
11
// Opens `url` in a small, resizable, scrollable window named "Code" that has
// no toolbar and no status bar.
function popupCode( url ) {
  var features = "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400";
  window.open( url, "Code", features );
}
15
// Toggles the visibility of the element whose id is `id` (used for the
// collapsible method source listings).
//
// Returns true when the element was found and toggled, false when the browser
// offers no way to look the element up or when no such element exists.
function toggleCode( id ) {
  var elem;

  // Standard DOM lookup first; document.all is the legacy fallback.
  // Index document.all directly instead of building code for eval(), which
  // breaks (or executes arbitrary text) when the id is not a plain identifier.
  if ( document.getElementById )
    elem = document.getElementById( id );
  else if ( document.all )
    elem = document.all[ id ];
  else
    return false;

  // Unknown id: nothing to toggle, and reading .style would throw.
  if ( !elem )
    return false;

  // Anything that is not explicitly "block" counts as hidden.
  var elemStyle = elem.style;
  if ( elemStyle.display != "block" ) {
    elemStyle.display = "block";
  } else {
    elemStyle.display = "none";
  }

  return true;
}
34
35 // Make codeblocks hidden by default
36 document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" )
37
38 // ]]>
39 </script>
40
41 </head>
42 <body>
43
44
45 <div id="fileHeader">
46 <h1>rir.rb</h1>
47 <table class="header-table">
48 <tr class="top-aligned-row">
49 <td><strong>Path:</strong></td>
50 <td>lib/rir.rb
51
52 </td>
53 </tr>
54 <tr class="top-aligned-row">
55 <td><strong>Last Update:</strong></td>
56 <td>2010-11-05 14:39:35 +0100</td>
57 </tr>
58 </table>
59 </div>
60 <!-- banner header -->
61
62 <div id="bodyContent">
63
64 <div id="contextContent">
65
66 <div id="requires-list">
67 <h3 class="section-bar">Required files</h3>
68
69 <div class="name-list">
70
71 rir/document&nbsp;&nbsp;
72
73 rir/string&nbsp;&nbsp;
74
75 </div>
76 </div>
77
78 </div>
79
80
81 </div>
82
83 <!-- if includes -->
84
85 <div id="section">
86
87
88
89
90 <!-- if method_list -->
91
92
93
94
95 </div>
96
97 <div id="validator-badges">
98 <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
99 </div>
100
101 </body>
102 </html>
103
doc/files/main_rb.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>File: main.rb [RDoc Documentation]</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <meta http-equiv="Content-Script-Type" content="text/javascript" />
8 <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
9 <script type="text/javascript">
10 // <![CDATA[
11
// Opens `url` in a small, resizable, scrollable window named "Code" that has
// no toolbar and no status bar.
function popupCode( url ) {
  var features = "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400";
  window.open( url, "Code", features );
}
15
// Toggles the visibility of the element whose id is `id` (used for the
// collapsible method source listings).
//
// Returns true when the element was found and toggled, false when the browser
// offers no way to look the element up or when no such element exists.
function toggleCode( id ) {
  var elem;

  // Standard DOM lookup first; document.all is the legacy fallback.
  // Index document.all directly instead of building code for eval(), which
  // breaks (or executes arbitrary text) when the id is not a plain identifier.
  if ( document.getElementById )
    elem = document.getElementById( id );
  else if ( document.all )
    elem = document.all[ id ];
  else
    return false;

  // Unknown id: nothing to toggle, and reading .style would throw.
  if ( !elem )
    return false;

  // Anything that is not explicitly "block" counts as hidden.
  var elemStyle = elem.style;
  if ( elemStyle.display != "block" ) {
    elemStyle.display = "block";
  } else {
    elemStyle.display = "none";
  }

  return true;
}
34
35 // Make codeblocks hidden by default
36 document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" )
37
38 // ]]>
39 </script>
40
41 </head>
42 <body>
43
44
45 <div id="fileHeader">
46 <h1>main.rb</h1>
47 <table class="header-table">
48 <tr class="top-aligned-row">
49 <td><strong>Path:</strong></td>
50 <td>main.rb
51
52 </td>
53 </tr>
54 <tr class="top-aligned-row">
55 <td><strong>Last Update:</strong></td>
56 <td>2010-11-05 14:40:11 +0100</td>
57 </tr>
58 </table>
59 </div>
60 <!-- banner header -->
61
62 <div id="bodyContent">
63
64 <div id="contextContent">
65
66 <div id="requires-list">
67 <h3 class="section-bar">Required files</h3>
68
69 <div class="name-list">
70
71 rir&nbsp;&nbsp;
72
73 </div>
74 </div>
75
76 </div>
77
78
79 </div>
80
81 <!-- if includes -->
82
83 <div id="section">
84
85
86
87
88 <!-- if method_list -->
89
90
91
92
93 </div>
94
95 <div id="validator-badges">
96 <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
97 </div>
98
99 </body>
100 </html>
101
doc/fr_class_index.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <!--
5
6 Classes [RDoc Documentation]
7
8 -->
9 <head>
10 <title>Classes [RDoc Documentation]</title>
11 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
12 <link rel="stylesheet" href="rdoc-style.css" type="text/css" />
13 <base target="docwin" />
14 </head>
15 <body>
16 <div class="index">
17 <h1 class="section-bar">Classes</h1>
18 <div id="index-entries">
19
20 <a href="classes/Rir.html">Rir</a><br />
21
22 <a href="classes/Rir/Document.html">Rir::Document</a><br />
23
24 <a href="classes/Rir/WebDocument.html">Rir::WebDocument</a><br />
25
26 <a href="classes/Rir/WikipediaPage.html">Rir::WikipediaPage</a><br />
27
28 <a href="classes/String.html">String</a><br />
29
30 </div>
31 </div>
32 </body>
33 </html>
34
doc/fr_file_index.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <!--
5
6 Files [RDoc Documentation]
7
8 -->
9 <head>
10 <title>Files [RDoc Documentation]</title>
11 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
12 <link rel="stylesheet" href="rdoc-style.css" type="text/css" />
13 <base target="docwin" />
14 </head>
15 <body>
16 <div class="index">
17 <h1 class="section-bar">Files</h1>
18 <div id="index-entries">
19
20 <a href="files/README_markdown.html">README.markdown</a><br />
21
22 <a href="files/lib/rir_rb.html">lib/rir.rb</a><br />
23
24 <a href="files/lib/rir/document_rb.html">lib/rir/document.rb</a><br />
25
26 <a href="files/lib/rir/string_rb.html">lib/rir/string.rb</a><br />
27
28 <a href="files/main_rb.html">main.rb</a><br />
29
30 </div>
31 </div>
32 </body>
33 </html>
34
doc/fr_method_index.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <!--
5
6 Methods [RDoc Documentation]
7
8 -->
9 <head>
10 <title>Methods [RDoc Documentation]</title>
11 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
12 <link rel="stylesheet" href="rdoc-style.css" type="text/css" />
13 <base target="docwin" />
14 </head>
15 <body>
16 <div class="index">
17 <h1 class="section-bar">Methods</h1>
18 <div id="index-entries">
19
20 <a href="classes/Rir/Document.html#M000012">count_words (Rir::Document)</a><br />
21
22 <a href="classes/Rir/Document.html#M000013">entropy (Rir::Document)</a><br />
23
24 <a href="classes/String.html#M000009">extract_xmltags_values (String)</a><br />
25
26 <a href="classes/Rir/Document.html#M000010">format_words (Rir::Document)</a><br />
27
28 <a href="classes/Rir/WebDocument.html#M000015">get_content (Rir::WebDocument)</a><br />
29
30 <a href="classes/String.html#M000001">is_stopword? (String)</a><br />
31
32 <a href="classes/Rir/WebDocument.html#M000016">new (Rir::WebDocument)</a><br />
33
34 <a href="classes/Rir/Document.html#M000014">new (Rir::Document)</a><br />
35
36 <a href="classes/Rir/Document.html#M000011">ngrams (Rir::Document)</a><br />
37
38 <a href="classes/String.html#M000002">remove_special_characters (String)</a><br />
39
40 <a href="classes/String.html#M000006">strip_javascripts (String)</a><br />
41
42 <a href="classes/String.html#M000005">strip_javascripts! (String)</a><br />
43
44 <a href="classes/String.html#M000008">strip_stylesheets (String)</a><br />
45
46 <a href="classes/String.html#M000007">strip_stylesheets! (String)</a><br />
47
48 <a href="classes/String.html#M000004">strip_xml_tags (String)</a><br />
49
50 <a href="classes/String.html#M000003">strip_xml_tags! (String)</a><br />
51
52 </div>
53 </div>
54 </body>
55 </html>
56
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <!--
5
6 RDoc Documentation
7
8 -->
9 <head>
10 <title>RDoc Documentation</title>
11 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
12 </head>
13 <frameset rows="20%, 80%">
14 <frameset cols="25%,35%,45%">
15 <frame src="fr_file_index.html" title="Files" name="Files" />
16 <frame src="fr_class_index.html" name="Classes" />
17 <frame src="fr_method_index.html" name="Methods" />
18 </frameset>
19 <frame src="files/README_markdown.html" name="docwin" />
20 </frameset>
21 </html>
22
File was created 1 body {
2 font-family: Verdana,Arial,Helvetica,sans-serif;
3 font-size: 90%;
4 margin: 0;
5 margin-left: 40px;
6 padding: 0;
7 background: white;
8 color: black;
9 }
10
11 h1, h2, h3, h4 {
12 margin: 0;
13 background: transparent;
14 }
15
16 h1 {
17 font-size: 150%;
18 }
19
20 h2,h3,h4 {
21 margin-top: 1em;
22 }
23
24 :link, :visited {
25 background: #eef;
26 color: #039;
27 text-decoration: none;
28 }
29
30 :link:hover, :visited:hover {
31 background: #039;
32 color: #eef;
33 }
34
35 /* Override the base stylesheet's Anchor inside a table cell */
36 td > :link, td > :visited {
37 background: transparent;
38 color: #039;
39 text-decoration: none;
40 }
41
42 /* and inside a section title */
43 .section-title > :link, .section-title > :visited {
44 background: transparent;
45 color: #eee;
46 text-decoration: none;
47 }
48
49 /* === Structural elements =================================== */
50
51 .index {
52 margin: 0;
53 margin-left: -40px;
54 padding: 0;
55 font-size: 90%;
56 }
57
58 .index :link, .index :visited {
59 margin-left: 0.7em;
60 }
61
62 .index .section-bar {
63 margin-left: 0px;
64 padding-left: 0.7em;
65 background: #ccc;
66 font-size: small;
67 }
68
69 #classHeader, #fileHeader {
70 width: auto;
71 color: white;
72 padding: 0.5em 1.5em 0.5em 1.5em;
73 margin: 0;
74 margin-left: -40px;
75 border-bottom: 3px solid #006;
76 }
77
78 #classHeader :link, #fileHeader :link,
79 #classHeader :visited, #fileHeader :visited {
80 background: inherit;
81 color: white;
82 }
83
84 #classHeader td, #fileHeader td {
85 background: inherit;
86 color: white;
87 }
88
89 #fileHeader {
90 background: #057;
91 }
92
93 #classHeader {
94 background: #048;
95 }
96
97 .class-name-in-header {
98 font-size: 180%;
99 font-weight: bold;
100 }
101
102 #bodyContent {
103 padding: 0 1.5em 0 1.5em;
104 }
105
106 #description {
107 padding: 0.5em 1.5em;
108 background: #efefef;
109 border: 1px dotted #999;
110 }
111
112 #description h1, #description h2, #description h3,
113 #description h4, #description h5, #description h6 {
114 color: #125;
115 background: transparent;
116 }
117
118 #validator-badges {
119 text-align: center;
120 }
121
122 #validator-badges img {
123 border: 0;
124 }
125
126 #copyright {
127 color: #333;
128 background: #efefef;
129 font: 0.75em sans-serif;
130 margin-top: 5em;
131 margin-bottom: 0;
132 padding: 0.5em 2em;
133 }
134
135 /* === Classes =================================== */
136
137 table.header-table {
138 color: white;
139 font-size: small;
140 }
141
142 .type-note {
143 font-size: small;
144 color: #dedede;
145 }
146
147 .section-bar {
148 color: #333;
149 border-bottom: 1px solid #999;
150 margin-left: -20px;
151 }
152
153 .section-title {
154 background: #79a;
155 color: #eee;
156 padding: 3px;
157 margin-top: 2em;
158 margin-left: -30px;
159 border: 1px solid #999;
160 }
161
162 .top-aligned-row {
163 vertical-align: top
164 }
165
166 .bottom-aligned-row {
167 vertical-align: bottom
168 }
169
170 #diagram img {
171 border: 0;
172 }
173
174 /* --- Context section classes ----------------------- */
175
176 .context-row { }
177
178 .context-item-name {
179 font-family: monospace;
180 font-weight: bold;
181 color: black;
182 }
183
184 .context-item-value {
185 font-size: small;
186 color: #448;
187 }
188
189 .context-item-desc {
190 color: #333;
191 padding-left: 2em;
192 }
193
194 /* --- Method classes -------------------------- */
195
196 .method-detail {
197 background: #efefef;
198 padding: 0;
199 margin-top: 0.5em;
200 margin-bottom: 1em;
201 border: 1px dotted #ccc;
202 }
203
204 .method-heading {
205 color: black;
206 background: #ccc;
207 border-bottom: 1px solid #666;
208 padding: 0.2em 0.5em 0 0.5em;
209 }
210
211 .method-signature {
212 color: black;
213 background: inherit;
214 }
215
216 .method-name {
217 font-weight: bold;
218 }
219
220 .method-args {
221 font-style: italic;
222 }
223
224 .method-description {
225 padding: 0 0.5em 0 0.5em;
226 }
227
228 /* --- Source code sections -------------------- */
229
230 :link.source-toggle, :visited.source-toggle {
231 font-size: 90%;
232 }
233
234 div.method-source-code {
235 background: #262626;
236 color: #ffdead;
237 margin: 1em;
238 padding: 0.5em;
239 border: 1px dashed #999;
240 overflow: auto;
241 }
242
243 div.method-source-code pre {
244 color: #ffdead;
245 }
246
247 /* --- Ruby keyword styles --------------------- */
248
249 .standalone-code {
250 background: #221111;
251 color: #ffdead;
252 overflow: auto;
253 }
254
255 .ruby-constant {
256 color: #7fffd4;
257 background: transparent;
258 }
259
260 .ruby-keyword {
261 color: #00ffff;
262 background: transparent;
263 }
264
265 .ruby-ivar {
266 color: #eedd82;
267 background: transparent;
268 }
269
270 .ruby-operator {
271 color: #00ffee;
272 background: transparent;
273 }
274
275 .ruby-identifier {
276 color: #ffdead;
277 background: transparent;
278 }
279
280 .ruby-node {
281 color: #ffa07a;
282 background: transparent;
283 }
284
285 .ruby-comment {
286 color: #b22222;
287 font-weight: bold;
288 background: transparent;
289 }
290
291 .ruby-regexp {
292 color: #ffa07a;
293 background: transparent;
294 }
295
296 .ruby-value {
297 color: #7fffd4;
298 background: transparent;
299 }
300
File was created 1 #!/usr/bin/env ruby
2
3 require 'rir/document'
4 require 'rir/string'
5
File was created 1 #!/usr/bin/env ruby
2
3 # This file is a part of an Information Retrieval oriented Ruby library
4 #
5 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
6 #
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11 #
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
16 #
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
# General module for many purposes related to Information Retrieval.
module Rir

  # A Document is a bag of words and is constructed from a string.
  class Document
    # words       - Array of the tokens extracted from the content, in order of
    #               appearance and with their original case.
    # doc_content - the String the Document was built from.
    attr_reader :words, :doc_content

    # Builds a Document from +content+ (a String) and tokenizes it right away.
    def initialize(content)
      @doc_content = content
      @words = format_words
    end

    # Returns an Array containing the distinct +n+-grams (words joined by a
    # single space) of the current Document, in order of first appearance.
    # +n+ is expected to be an Integer; an +n+ below 1, or larger than the
    # number of words, yields an empty Array.
    #
    #   ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]
    def ngrams(n)
      # each_cons raises on a non-positive size, so keep returning [] for it.
      return [] if n < 1

      @words.each_cons(n).map { |gram| gram.join(" ") }.uniq
    end

    # Returns a Hash mapping each downcased word of the current Document to
    # its number of occurrences. Looking up a word that is not in the
    # Document returns 0 and does not add the word to the Hash.
    #
    #   count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }
    def count_words
      @words.each_with_object(Hash.new(0)) { |word, counts| counts[word.downcase] += 1 }
    end

    # Computes the entropy of a given string +s+ inside the document.
    #
    # If the string parameter is composed of many words (i.e. tokens separated
    # by whitespace(s)), it is considered as an ngram: the result is the sum
    # of -p(w) * log2(p(w)) over its words, where p(w) is the relative
    # frequency of w in the Document. Words are matched case-insensitively,
    # consistently with count_words.
    #
    # A word that does not occur in the Document contributes 0 (the usual
    # 0 * log(0) = 0 convention), so the result is never NaN, even for an
    # empty Document.
    #
    #   entropy("guitar") #=> 0.00389919463243839
    def entropy(s)
      total = @words.count.to_f
      counts = count_words

      s.split.inject(0.0) do |sum, term|
        occurrences = counts[term.downcase]
        # Skipping absent words also guarantees total > 0 below.
        next sum if occurrences.zero?

        probability = occurrences / total
        sum - probability * Math.log2(probability)
      end
    end

    protected

    # Splits the content on every non-word character (\W) and keeps only the
    # tokens that contain at least one ASCII letter, so purely numeric or
    # underscore-only tokens are dropped.
    #
    # Protected function, only meant to be called at the initialization.
    def format_words
      @doc_content.scan(/\w+/).select { |token| token =~ /[a-zA-Z]/ }
    end
  end

  # A WebDocument is a Document with a +url+.
  class WebDocument < Document
    # The address the content was fetched from, as given to the constructor.
    attr_reader :url

    # Returns the HTML text from the page of a given +url+.
    # Performs an HTTP GET through Net::HTTP; errors raised by URI.parse or
    # Net::HTTP are not rescued here.
    def self.get_content(url)
      require 'net/http' # loaded lazily: only needed when a page is fetched
      address = URI.parse(url)
      Net::HTTP.get(address)
    end

    # WebDocument constructor, the content of the Document is the HTML page
    # without its scripts, stylesheets and tags (relies on the String
    # extensions from rir/string).
    def initialize(url)
      @url = url
      html = WebDocument.get_content(url)
      super(html.strip_javascripts.strip_stylesheets.strip_xml_tags)
    end
  end

  # A WikipediaPage is a WebDocument.
  class WikipediaPage < WebDocument
  end
end
122
File was created 1 #!/usr/bin/env ruby
2
3 # This file is a part of an Information Retrieval oriented Ruby library
4 #
5 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
6 #
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11 #
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
16 #
17 # You should have received a copy of the GNU General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
# General module for many purposes related to Information Retrieval.
module Rir

  # These are the default stopwords provided by Lemur.
  # All entries are lowercase; String#is_stopword? downcases before lookup.
  Stoplist = %w[
    a anything anyway anywhere apart are around as at av
    be became because become becomes becoming been before beforehand
    behind being below beside besides between beyond both but by
    can cannot canst certain cf choose contrariwise cos could cu
    day do does doesn't doing dost doth double down dual during
    each either else elsewhere enough et etc even ever every
    everybody everyone everything everywhere except excepted excepting
    exception exclude excluding exclusive far farther farthest few ff
    first for formerly forth forward from front further furthermore
    furthest get go had halves hardly has hast hath have he
    hence henceforth her here hereabouts hereafter hereby herein hereto
    hereupon hers herself him himself hindmost his hither hitherto
    how however howsoever i ie if in inasmuch inc include
    included including indeed indoors inside insomuch instead into
    inward inwards is it its itself just kind kg km last
    latter latterly less lest let like little ltd many may maybe
    me meantime meanwhile might moreover most mostly more mr mrs
    ms much must my myself namely need neither never nevertheless
    next no nobody none nonetheless noone nope nor not nothing
    notwithstanding now nowadays nowhere of off often ok on once
    one only onto or other others otherwise ought our ours
    ourselves out outside over own per perhaps plenty provide quite
    rather really round said sake same sang save saw see seeing
    seem seemed seeming seems seen seldom selves sent several shalt
    she should shown sideways since slept slew slung slunk smote
    so some somebody somehow someone something sometime sometimes
    somewhat somewhere spake spat spoke spoken sprang sprung stave
    staves still such supposing than that the thee their them
    themselves then thence thenceforth there thereabout thereabouts
    thereafter thereby therefore therein thereof thereon thereto thereupon
    these they this those thou though thrice through throughout thru
    thus thy thyself till to together too toward towards ugh
    unable under underneath unless unlike until up upon upward
    upwards us use used using very via vs want was we week
    well were what whatever whatsoever when whence whenever whensoever
    where whereabouts whereafter whereas whereat whereby wherefore
    wherefrom wherein whereinto whereof whereon wheresoever whereto
    whereunto whereupon wherever wherewith whether whew which whichever
    whichsoever while whilst whither who whoa whoever whole whom
    whomever whomsoever whose whosoever why will wilt with within
    without worse worst would wow ye yet year yippee you your
    yours yourself yourselves
  ]

end

# Extension of the standard class String with useful functions.
class String
  include Rir

  # Returns +true+ if +self+ belongs to Rir::Stoplist, +false+ otherwise.
  # The comparison is case-insensitive.
  def is_stopword?
    Stoplist.include?(downcase)
  end

  # Do not use.
  # TODO: revamp. Find out why this function is here.
  #
  # Current behaviour: every non-word character (\W) acts as a separator, the
  # remaining tokens are joined with single spaces, and one-character tokens
  # are replaced by empty strings (which can leave extra spaces behind).
  #
  #   "foo-bar, baz!".remove_special_characters #=> "foo bar baz"
  def remove_special_characters
    split.map do |chunk|
      tokens = chunk.gsub(/\W/, ' ').split
      tokens.map { |token| token.sub(/\A.\z/, '') }.join(' ').strip.sub(/\A.\z/, '')
    end.join(' ')
  end

  # Removes all XML-like tags from +self+, in place.
  #
  #   s = "<html><body>test</body></html>"
  #   s.strip_xml_tags!
  #   s         #=> "test"
  def strip_xml_tags!
    replace(strip_with_pattern(%r{</?[^>]*>}))
  end

  # Returns a copy of +self+ with all XML-like tags removed.
  #
  #   s = "<html><body>test</body></html>"
  #   s.strip_xml_tags   #=> "test"
  #   s                  #=> "<html><body>test</body></html>"
  def strip_xml_tags
    dup.strip_xml_tags!
  end

  # Removes all <script> elements (opening tag, content and closing tag) from
  # +self+, in place. The opening tag may carry any attributes, with any
  # quoting, and the element may be empty; tag names are matched
  # case-insensitively.
  #
  #   s = "<script type='text/javascript'>
  #   var skin='vector',
  #   stylepath='http://bits.wikimedia.org/skins-1.5'
  #   </script>test"
  #   s.strip_javascripts!
  #   s         #=> "test"
  def strip_javascripts!
    replace(strip_with_pattern(%r{<script\b[^>]*>.*?</script\s*>}mi))
  end

  # Returns a copy of +self+ with all <script> elements removed.
  #
  #   s = "<script type='text/javascript'>var skin='vector'</script>test"
  #   s.strip_javascripts   #=> "test"
  def strip_javascripts
    dup.strip_javascripts!
  end

  # Removes all <style> elements (opening tag, content and closing tag) from
  # +self+, in place. Same matching rules as strip_javascripts!.
  #
  #   s = "<style type='text/css'>p { color: red }</style>test"
  #   s.strip_stylesheets!
  #   s         #=> "test"
  def strip_stylesheets!
    replace(strip_with_pattern(%r{<style\b[^>]*>.*?</style\s*>}mi))
  end

  # Returns a copy of +self+ with all <style> elements removed.
  def strip_stylesheets
    dup.strip_stylesheets!
  end

  # Returns the text values inside all occurrences of a XML tag in +self+.
  #
  # +tag_name+ is matched literally and as a whole name, so asking for 'a'
  # does not pick up <abbr> elements. Values may span several lines; empty
  # elements are ignored.
  #
  #   s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre"
  #   s.extract_xmltags_values 'a'   #=> ["Indianapolis", "Indiana"]
  def extract_xmltags_values(tag_name)
    tag = Regexp.escape(tag_name.to_s)
    # The lookahead ends the tag name, so a longer name sharing the prefix is rejected.
    scan(%r{<#{tag}(?=[\s/>])[^>]*>(.+?)</#{tag}\s*>}m).flatten
  end

  private

  # Returns a copy of +self+ where every match of +pattern+ is removed, HTML
  # entities are unescaped and the result is converted to UTF-8 (Kconv).
  #
  # NOTE(review): entities are unescaped on every call, so chaining several
  # strip_* methods unescapes repeatedly ("&amp;lt;" ends up as "<").
  def strip_with_pattern(pattern)
    # Loaded lazily: only the strip_* methods need these libraries.
    require 'cgi'
    require 'kconv'
    CGI.unescapeHTML(gsub(pattern, "")).toutf8
  end
end
156
File was created 1 $LOAD_PATH.unshift File.expand_path(File.join(File.dirname(__FILE__), "lib"))
2
3 require 'rir'
4