Commit cd74322524114fbad147da48f608384d03e46c58

Authored by Romain Deveaud
1 parent b3995017e6
Exists in master

adding missing files

Showing 29 changed files with 2390 additions and 0 deletions Inline Diff

doc/classes/Mirimiri.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>Module: Mirimiri [RDoc Documentation]</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <meta http-equiv="Content-Script-Type" content="text/javascript" />
8 <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" />
9 <script type="text/javascript">
10 // <![CDATA[
11
12 function popupCode( url ) {
13 window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
14 }
15
16 function toggleCode( id ) {
17 if ( document.getElementById )
18 elem = document.getElementById( id );
19 else if ( document.all )
20 elem = eval( "document.all." + id );
21 else
22 return false;
23
24 elemStyle = elem.style;
25
26 if ( elemStyle.display != "block" ) {
27 elemStyle.display = "block"
28 } else {
29 elemStyle.display = "none"
30 }
31
32 return true;
33 }
34
35 // Make codeblocks hidden by default
36 document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" )
37
38 // ]]>
39 </script>
40
41 </head>
42 <body>
43
44
45 <div id="classHeader">
46 <table class="header-table">
47 <tr class="top-aligned-row">
48 <td><strong>Module</strong></td>
49 <td class="class-name-in-header">Mirimiri</td>
50 </tr>
51 <tr class="top-aligned-row">
52 <td><strong>In:</strong></td>
53 <td>
54
55
56 <a href="../files/lib/mirimiri/string_rb.html">
57
58 lib/mirimiri/string.rb
59
60 </a>
61
62
63 <br />
64
65
66 <a href="../files/lib/mirimiri/document_rb.html">
67
68 lib/mirimiri/document.rb
69
70 </a>
71
72
73 <br />
74
75 </td>
76 </tr>
77
78
79 </table>
80 </div>
81 <!-- banner header -->
82
83 <div id="bodyContent">
84
85 <div id="contextContent">
86
87 <div id="description">
88 <hr size="1"></hr><p>
89 General module
90 </p>
91
92 </div>
93
94 </div>
95
96
97 </div>
98
99 <!-- if includes -->
100
101 <div id="section">
102
103 <div id="class-list">
104 <h3 class="section-bar">Classes and Modules</h3>
105
106 Class <a href="Mirimiri/Document.html" class="link">Mirimiri::Document</a><br />
107 Class <a href="Mirimiri/WebDocument.html" class="link">Mirimiri::WebDocument</a><br />
108 Class <a href="Mirimiri/WikipediaPage.html" class="link">Mirimiri::WikipediaPage</a><br />
109
110 </div>
111
112 <div id="constants-list">
113 <h3 class="section-bar">Constants</h3>
114
115 <div class="name-list">
116 <table summary="Constants">
117
118 <tr class="top-aligned-row context-row">
119 <td class="context-item-name">Stoplist</td>
120 <td>=</td>
121 <td class="context-item-value">[ &quot;a&quot;, &quot;anything&quot;, &quot;anyway&quot;, &quot;anywhere&quot;, &quot;apart&quot;, &quot;are&quot;, &quot;around&quot;, &quot;as&quot;, &quot;at&quot;, &quot;av&quot;, &quot;be&quot;, &quot;became&quot;, &quot;because&quot;, &quot;become&quot;, &quot;becomes&quot;, &quot;becoming&quot;, &quot;been&quot;, &quot;before&quot;, &quot;beforehand&quot;, &quot;behind&quot;, &quot;being&quot;, &quot;below&quot;, &quot;beside&quot;, &quot;besides&quot;, &quot;between&quot;, &quot;beyond&quot;, &quot;both&quot;, &quot;but&quot;, &quot;by&quot;, &quot;can&quot;, &quot;cannot&quot;, &quot;canst&quot;, &quot;certain&quot;, &quot;cf&quot;, &quot;choose&quot;, &quot;contrariwise&quot;, &quot;cos&quot;, &quot;could&quot;, &quot;cu&quot;, &quot;day&quot;, &quot;do&quot;, &quot;does&quot;, &quot;doesn't&quot;, &quot;doing&quot;, &quot;dost&quot;, &quot;doth&quot;, &quot;double&quot;, &quot;down&quot;, &quot;dual&quot;, &quot;during&quot;, &quot;each&quot;, &quot;either&quot;, &quot;else&quot;, &quot;elsewhere&quot;, &quot;enough&quot;, &quot;et&quot;, &quot;etc&quot;, &quot;even&quot;, &quot;ever&quot;, &quot;every&quot;, &quot;everybody&quot;, &quot;everyone&quot;, &quot;everything&quot;, &quot;everywhere&quot;, &quot;except&quot;, &quot;excepted&quot;, &quot;excepting&quot;, &quot;exception&quot;, &quot;exclude&quot;, &quot;excluding&quot;, &quot;exclusive&quot;, &quot;far&quot;, &quot;farther&quot;, &quot;farthest&quot;, &quot;few&quot;, &quot;ff&quot;, &quot;first&quot;, &quot;for&quot;, &quot;formerly&quot;, &quot;forth&quot;, &quot;forward&quot;, &quot;from&quot;, &quot;front&quot;, &quot;further&quot;, &quot;furthermore&quot;, &quot;furthest&quot;, &quot;get&quot;, &quot;go&quot;, &quot;had&quot;, &quot;halves&quot;, &quot;hardly&quot;, &quot;has&quot;, &quot;hast&quot;, &quot;hath&quot;, &quot;have&quot;, &quot;he&quot;, &quot;hence&quot;, &quot;henceforth&quot;, &quot;her&quot;, &quot;here&quot;, &quot;hereabouts&quot;, 
&quot;hereafter&quot;, &quot;hereby&quot;, &quot;herein&quot;, &quot;hereto&quot;, &quot;hereupon&quot;, &quot;hers&quot;, &quot;herself&quot;, &quot;him&quot;, &quot;himself&quot;, &quot;hindmost&quot;, &quot;his&quot;, &quot;hither&quot;, &quot;hitherto&quot;, &quot;how&quot;, &quot;however&quot;, &quot;howsoever&quot;, &quot;i&quot;, &quot;ie&quot;, &quot;if&quot;, &quot;in&quot;, &quot;inasmuch&quot;, &quot;inc&quot;, &quot;include&quot;, &quot;included&quot;, &quot;including&quot;, &quot;indeed&quot;, &quot;indoors&quot;, &quot;inside&quot;, &quot;insomuch&quot;, &quot;instead&quot;, &quot;into&quot;, &quot;inward&quot;, &quot;inwards&quot;, &quot;is&quot;, &quot;it&quot;, &quot;its&quot;, &quot;itself&quot;, &quot;just&quot;, &quot;kind&quot;, &quot;kg&quot;, &quot;km&quot;, &quot;last&quot;, &quot;latter&quot;, &quot;latterly&quot;, &quot;less&quot;, &quot;lest&quot;, &quot;let&quot;, &quot;like&quot;, &quot;little&quot;, &quot;ltd&quot;, &quot;many&quot;, &quot;may&quot;, &quot;maybe&quot;, &quot;me&quot;, &quot;meantime&quot;, &quot;meanwhile&quot;, &quot;might&quot;, &quot;moreover&quot;, &quot;most&quot;, &quot;mostly&quot;, &quot;more&quot;, &quot;mr&quot;, &quot;mrs&quot;, &quot;ms&quot;, &quot;much&quot;, &quot;must&quot;, &quot;my&quot;, &quot;myself&quot;, &quot;namely&quot;, &quot;need&quot;, &quot;neither&quot;, &quot;never&quot;, &quot;nevertheless&quot;, &quot;next&quot;, &quot;no&quot;, &quot;nobody&quot;, &quot;none&quot;, &quot;nonetheless&quot;, &quot;noone&quot;, &quot;nope&quot;, &quot;nor&quot;, &quot;not&quot;, &quot;nothing&quot;, &quot;notwithstanding&quot;, &quot;now&quot;, &quot;nowadays&quot;, &quot;nowhere&quot;, &quot;of&quot;, &quot;off&quot;, &quot;often&quot;, &quot;ok&quot;, &quot;on&quot;, &quot;once&quot;, &quot;one&quot;, &quot;only&quot;, &quot;onto&quot;, &quot;or&quot;, &quot;other&quot;, &quot;others&quot;, &quot;otherwise&quot;, &quot;ought&quot;, &quot;our&quot;, &quot;ours&quot;, &quot;ourselves&quot;, 
&quot;out&quot;, &quot;outside&quot;, &quot;over&quot;, &quot;own&quot;, &quot;per&quot;, &quot;perhaps&quot;, &quot;plenty&quot;, &quot;provide&quot;, &quot;quite&quot;, &quot;rather&quot;, &quot;really&quot;, &quot;round&quot;, &quot;said&quot;, &quot;sake&quot;, &quot;same&quot;, &quot;sang&quot;, &quot;save&quot;, &quot;saw&quot;, &quot;see&quot;, &quot;seeing&quot;, &quot;seem&quot;, &quot;seemed&quot;, &quot;seeming&quot;, &quot;seems&quot;, &quot;seen&quot;, &quot;seldom&quot;, &quot;selves&quot;, &quot;sent&quot;, &quot;several&quot;, &quot;shalt&quot;, &quot;she&quot;, &quot;should&quot;, &quot;shown&quot;, &quot;sideways&quot;, &quot;since&quot;, &quot;slept&quot;, &quot;slew&quot;, &quot;slung&quot;, &quot;slunk&quot;, &quot;smote&quot;, &quot;so&quot;, &quot;some&quot;, &quot;somebody&quot;, &quot;somehow&quot;, &quot;someone&quot;, &quot;something&quot;, &quot;sometime&quot;, &quot;sometimes&quot;, &quot;somewhat&quot;, &quot;somewhere&quot;, &quot;spake&quot;, &quot;spat&quot;, &quot;spoke&quot;, &quot;spoken&quot;, &quot;sprang&quot;, &quot;sprung&quot;, &quot;stave&quot;, &quot;staves&quot;, &quot;still&quot;, &quot;such&quot;, &quot;supposing&quot;, &quot;than&quot;, &quot;that&quot;, &quot;the&quot;, &quot;thee&quot;, &quot;their&quot;, &quot;them&quot;, &quot;themselves&quot;, &quot;then&quot;, &quot;thence&quot;, &quot;thenceforth&quot;, &quot;there&quot;, &quot;thereabout&quot;, &quot;thereabouts&quot;, &quot;thereafter&quot;, &quot;thereby&quot;, &quot;therefore&quot;, &quot;therein&quot;, &quot;thereof&quot;, &quot;thereon&quot;, &quot;thereto&quot;, &quot;thereupon&quot;, &quot;these&quot;, &quot;they&quot;, &quot;this&quot;, &quot;those&quot;, &quot;thou&quot;, &quot;though&quot;, &quot;thrice&quot;, &quot;through&quot;, &quot;throughout&quot;, &quot;thru&quot;, &quot;thus&quot;, &quot;thy&quot;, &quot;thyself&quot;, &quot;till&quot;, &quot;to&quot;, &quot;together&quot;, &quot;too&quot;, &quot;toward&quot;, &quot;towards&quot;, 
&quot;ugh&quot;, &quot;unable&quot;, &quot;under&quot;, &quot;underneath&quot;, &quot;unless&quot;, &quot;unlike&quot;, &quot;until&quot;, &quot;up&quot;, &quot;upon&quot;, &quot;upward&quot;, &quot;upwards&quot;, &quot;us&quot;, &quot;use&quot;, &quot;used&quot;, &quot;using&quot;, &quot;very&quot;, &quot;via&quot;, &quot;vs&quot;, &quot;want&quot;, &quot;was&quot;, &quot;we&quot;, &quot;week&quot;, &quot;well&quot;, &quot;were&quot;, &quot;what&quot;, &quot;whatever&quot;, &quot;whatsoever&quot;, &quot;when&quot;, &quot;whence&quot;, &quot;whenever&quot;, &quot;whensoever&quot;, &quot;where&quot;, &quot;whereabouts&quot;, &quot;whereafter&quot;, &quot;whereas&quot;, &quot;whereat&quot;, &quot;whereby&quot;, &quot;wherefore&quot;, &quot;wherefrom&quot;, &quot;wherein&quot;, &quot;whereinto&quot;, &quot;whereof&quot;, &quot;whereon&quot;, &quot;wheresoever&quot;, &quot;whereto&quot;, &quot;whereunto&quot;, &quot;whereupon&quot;, &quot;wherever&quot;, &quot;wherewith&quot;, &quot;whether&quot;, &quot;whew&quot;, &quot;which&quot;, &quot;whichever&quot;, &quot;whichsoever&quot;, &quot;while&quot;, &quot;whilst&quot;, &quot;whither&quot;, &quot;who&quot;, &quot;whoa&quot;, &quot;whoever&quot;, &quot;whole&quot;, &quot;whom&quot;, &quot;whomever&quot;, &quot;whomsoever&quot;, &quot;whose&quot;, &quot;whosoever&quot;, &quot;why&quot;, &quot;will&quot;, &quot;wilt&quot;, &quot;with&quot;, &quot;within&quot;, &quot;without&quot;, &quot;worse&quot;, &quot;worst&quot;, &quot;would&quot;, &quot;wow&quot;, &quot;ye&quot;, &quot;yet&quot;, &quot;year&quot;, &quot;yippee&quot;, &quot;you&quot;, &quot;your&quot;, &quot;yours&quot;, &quot;yourself&quot;, &quot;yourselves&quot; ]</td>
122
123 <td>&nbsp;</td>
124 <td class="context-item-desc">
125 These are the default stopwords provided by Lemur.
126
127 </td>
128
129 </tr>
130
131 </table>
132 </div>
133 </div>
134
135
136
137
138 <!-- if method_list -->
139
140
141
142
143 </div>
144
145 <div id="validator-badges">
146 <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
147 </div>
148
149 </body>
150 </html>
151
doc/classes/Mirimiri/Document.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>Class: Mirimiri::Document [RDoc Documentation]</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <meta http-equiv="Content-Script-Type" content="text/javascript" />
8 <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
9 <script type="text/javascript">
10 // <![CDATA[
11
12 function popupCode( url ) {
13 window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
14 }
15
16 function toggleCode( id ) {
17 if ( document.getElementById )
18 elem = document.getElementById( id );
19 else if ( document.all )
20 elem = eval( "document.all." + id );
21 else
22 return false;
23
24 elemStyle = elem.style;
25
26 if ( elemStyle.display != "block" ) {
27 elemStyle.display = "block"
28 } else {
29 elemStyle.display = "none"
30 }
31
32 return true;
33 }
34
35 // Make codeblocks hidden by default
36 document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" )
37
38 // ]]>
39 </script>
40
41 </head>
42 <body>
43
44
45 <div id="classHeader">
46 <table class="header-table">
47 <tr class="top-aligned-row">
48 <td><strong>Class</strong></td>
49 <td class="class-name-in-header">Mirimiri::Document</td>
50 </tr>
51 <tr class="top-aligned-row">
52 <td><strong>In:</strong></td>
53 <td>
54
55
56 <a href="../../files/lib/mirimiri/document_rb.html">
57
58 lib/mirimiri/document.rb
59
60 </a>
61
62
63 <br />
64
65 </td>
66 </tr>
67
68
69 <tr class="top-aligned-row">
70 <td><strong>Parent:</strong></td>
71 <td>
72
73 Object
74
75 </td>
76 </tr>
77
78 </table>
79 </div>
80 <!-- banner header -->
81
82 <div id="bodyContent">
83
84 <div id="contextContent">
85
86 <div id="description">
87 <p>
88 A <a href="Document.html">Document</a> is a bag of words and is constructed
89 from a string.
90 </p>
91
92 </div>
93
94 </div>
95
96
97 <div id="method-list">
98 <h3 class="section-bar">Methods</h3>
99
100 <div class="name-list">
101
102 <a href="#M000024">count_words</a>&nbsp;&nbsp;
103
104 <a href="#M000025">entropy</a>&nbsp;&nbsp;
105
106 <a href="#M000022">format_words</a>&nbsp;&nbsp;
107
108 <a href="#M000027">new</a>&nbsp;&nbsp;
109
110 <a href="#M000023">ngrams</a>&nbsp;&nbsp;
111
112 <a href="#M000026">tf</a>&nbsp;&nbsp;
113
114 </div>
115 </div>
116
117 </div>
118
119 <!-- if includes -->
120
121 <div id="section">
122
123
124
125 <div id="attribute-list">
126 <h3 class="section-bar">Attributes</h3>
127
128 <div class="name-list">
129 <table>
130
131 <tr class="top-aligned-row context-row">
132 <td class="context-item-name">doc_content</td>
133
134 <td class="context-item-value">&nbsp;[R]&nbsp;</td>
135
136 <td class="context-item-desc"></td>
137 </tr>
138
139 <tr class="top-aligned-row context-row">
140 <td class="context-item-name">words</td>
141
142 <td class="context-item-value">&nbsp;[R]&nbsp;</td>
143
144 <td class="context-item-desc"></td>
145 </tr>
146
147 </table>
148 </div>
149 </div>
150
151
152 <!-- if method_list -->
153
154 <div id="methods">
155
156 <h3 class="section-bar">Public Class methods</h3>
157
158
159 <div id="method-M000027" class="method-detail">
160 <a name="M000027"></a>
161
162 <div class="method-heading">
163
164 <a href="Document.src/M000027.html" target="Code" class="method-signature"
165 onclick="popupCode('Document.src/M000027.html');return false;">
166
167 <span class="method-name">new</span><span class="method-args">(content)</span>
168
169 </a>
170
171 </div>
172
173 <div class="method-description">
174
175 </div>
176 </div>
177
178
179 <h3 class="section-bar">Public Instance methods</h3>
180
181
182 <div id="method-M000024" class="method-detail">
183 <a name="M000024"></a>
184
185 <div class="method-heading">
186
187 <a href="Document.src/M000024.html" target="Code" class="method-signature"
188 onclick="popupCode('Document.src/M000024.html');return false;">
189
190 <span class="method-name">count_words</span><span class="method-args">()</span>
191
192 </a>
193
194 </div>
195
196 <div class="method-description">
197
198 <p>
199 Returns a Hash containing the words and their associated counts in the
200 current <a href="Document.html">Document</a>.
201 </p>
202 <pre>
203 count_words #=&gt; { &quot;guitar&quot;=&gt;1, &quot;bass&quot;=&gt;3, &quot;album&quot;=&gt;20, ... }
204 </pre>
205
206 </div>
207 </div>
208
209
210 <div id="method-M000025" class="method-detail">
211 <a name="M000025"></a>
212
213 <div class="method-heading">
214
215 <a href="Document.src/M000025.html" target="Code" class="method-signature"
216 onclick="popupCode('Document.src/M000025.html');return false;">
217
218 <span class="method-name">entropy</span><span class="method-args">(s)</span>
219
220 </a>
221
222 </div>
223
224 <div class="method-description">
225
226 <p>
227 Computes the entropy of a given string <tt>s</tt> inside the document.
228 </p>
229 <p>
230 If the string parameter is composed of many words (i.e. tokens separated by
231 whitespace(s)), it is considered as an ngram.
232 </p>
233 <pre>
234 entropy(&quot;guitar&quot;) #=&gt; 0.00432114812727959
235 entropy(&quot;dillinger escape plan&quot;) #=&gt; 0.265862076325102
236 </pre>
237
238 </div>
239 </div>
240
241
242 <div id="method-M000023" class="method-detail">
243 <a name="M000023"></a>
244
245 <div class="method-heading">
246
247 <a href="Document.src/M000023.html" target="Code" class="method-signature"
248 onclick="popupCode('Document.src/M000023.html');return false;">
249
250 <span class="method-name">ngrams</span><span class="method-args">(n)</span>
251
252 </a>
253
254 </div>
255
256 <div class="method-description">
257
258 <p>
259 Returns an Array containing the <tt>n</tt>-grams (words) from the current
260 <a href="Document.html">Document</a>.
261 </p>
262 <pre>
263 ngrams(2) #=&gt; [&quot;the free&quot;, &quot;free encyclopedia&quot;, &quot;encyclopedia var&quot;, &quot;var skin&quot;, ...]
264 </pre>
265
266 </div>
267 </div>
268
269
270 <div id="method-M000026" class="method-detail">
271 <a name="M000026"></a>
272
273 <div class="method-heading">
274
275 <a href="Document.src/M000026.html" target="Code" class="method-signature"
276 onclick="popupCode('Document.src/M000026.html');return false;">
277
278 <span class="method-name">tf</span><span class="method-args">(s)</span>
279
280 </a>
281
282 </div>
283
284 <div class="method-description">
285
286 <p>
287 Computes the term frequency of a given <b>word</b> <tt>s</tt>.
288 </p>
289 <pre>
290 tf(&quot;guitar&quot;) #=&gt; 0.000380372765310004
291 </pre>
292
293 </div>
294 </div>
295
296
297 <h3 class="section-bar">Protected Instance methods</h3>
298
299
300 <div id="method-M000022" class="method-detail">
301 <a name="M000022"></a>
302
303 <div class="method-heading">
304
305 <a href="Document.src/M000022.html" target="Code" class="method-signature"
306 onclick="popupCode('Document.src/M000022.html');return false;">
307
308 <span class="method-name">format_words</span><span class="method-args">()</span>
309
310 </a>
311
312 </div>
313
314 <div class="method-description">
315
316 <p>
317 Any non-word characters are removed from the words (see <a
318 href="http://perldoc.perl.org/perlre.html">perldoc.perl.org/perlre.html</a>
319 and the \W special escape).
320 </p>
321 <p>
322 Protected function, only meant to be called at initialization.
323 </p>
324
325 </div>
326 </div>
327
328
329
330 </div>
331
332
333
334
335 </div>
336
337 <div id="validator-badges">
338 <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
339 </div>
340
341 </body>
342 </html>
343
doc/classes/Mirimiri/Document.src/M000022.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>format_words (Mirimiri::Document)</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
8 </head>
9 <body class="standalone-code">
10 <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 34</span>
11 <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">format_words</span>
12 <span class="ruby-identifier">wo</span> = []
13
14 <span class="ruby-ivar">@doc_content</span>.<span class="ruby-identifier">split</span>.<span class="ruby-identifier">each</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">w</span><span class="ruby-operator">|</span>
15 <span class="ruby-identifier">w</span>.<span class="ruby-identifier">split</span>(<span class="ruby-regexp re">/\W/</span>).<span class="ruby-identifier">each</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">sw</span><span class="ruby-operator">|</span>
16 <span class="ruby-identifier">wo</span>.<span class="ruby-identifier">push</span>(<span class="ruby-identifier">sw</span>.<span class="ruby-identifier">downcase</span>) <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">sw</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/[a-zA-Z]/</span>
17 <span class="ruby-keyword kw">end</span>
18 <span class="ruby-keyword kw">end</span>
19
20 <span class="ruby-identifier">wo</span>
21 <span class="ruby-keyword kw">end</span></pre>
22 </body>
23 </html>
24
doc/classes/Mirimiri/Document.src/M000023.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>ngrams (Mirimiri::Document)</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
8 </head>
9 <body class="standalone-code">
10 <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 49</span>
11 <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">ngrams</span>(<span class="ruby-identifier">n</span>)
12 <span class="ruby-identifier">window</span> = []
13 <span class="ruby-identifier">ngrams_array</span> = []
14
15 <span class="ruby-ivar">@words</span>.<span class="ruby-identifier">each</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">w</span><span class="ruby-operator">|</span>
16 <span class="ruby-identifier">window</span>.<span class="ruby-identifier">push</span>(<span class="ruby-identifier">w</span>)
17 <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">window</span>.<span class="ruby-identifier">size</span> <span class="ruby-operator">==</span> <span class="ruby-identifier">n</span>
18 <span class="ruby-identifier">ngrams_array</span>.<span class="ruby-identifier">push</span> <span class="ruby-identifier">window</span>.<span class="ruby-identifier">join</span>(<span class="ruby-value str">&quot; &quot;</span>)
19 <span class="ruby-identifier">window</span>.<span class="ruby-identifier">delete_at</span>(<span class="ruby-value">0</span>)
20 <span class="ruby-keyword kw">end</span>
21 <span class="ruby-keyword kw">end</span>
22
23 <span class="ruby-identifier">ngrams_array</span>.<span class="ruby-identifier">uniq</span>
24 <span class="ruby-keyword kw">end</span></pre>
25 </body>
26 </html>
27
doc/classes/Mirimiri/Document.src/M000024.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>count_words (Mirimiri::Document)</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
8 </head>
9 <body class="standalone-code">
10 <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 67</span>
11 <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">count_words</span>
12 <span class="ruby-identifier">counts</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">h</span>,<span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">h</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-value">0</span> }
13 <span class="ruby-ivar">@words</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">w</span><span class="ruby-operator">|</span> <span class="ruby-identifier">counts</span>[<span class="ruby-identifier">w</span>] <span class="ruby-operator">+=</span> <span class="ruby-value">1</span> }
14
15 <span class="ruby-identifier">counts</span>
16 <span class="ruby-keyword kw">end</span></pre>
17 </body>
18 </html>
19
doc/classes/Mirimiri/Document.src/M000025.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>entropy (Mirimiri::Document)</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
8 </head>
9 <body class="standalone-code">
10 <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 81</span>
11 <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">entropy</span>(<span class="ruby-identifier">s</span>)
12 <span class="ruby-identifier">en</span> = <span class="ruby-value">0</span><span class="ruby-value">.0</span>
13 <span class="ruby-identifier">counts</span> = <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">count_words</span>
14
15 <span class="ruby-identifier">s</span>.<span class="ruby-identifier">split</span>.<span class="ruby-identifier">each</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">w</span><span class="ruby-operator">|</span>
16 <span class="ruby-identifier">p_wi</span> = <span class="ruby-identifier">counts</span>[<span class="ruby-identifier">w</span>].<span class="ruby-identifier">to_f</span><span class="ruby-operator">/</span><span class="ruby-ivar">@words</span>.<span class="ruby-identifier">count</span>.<span class="ruby-identifier">to_f</span>
17 <span class="ruby-identifier">en</span> <span class="ruby-operator">+=</span> <span class="ruby-identifier">p_wi</span><span class="ruby-operator">*</span><span class="ruby-constant">Math</span>.<span class="ruby-identifier">log2</span>(<span class="ruby-identifier">p_wi</span>)
18 <span class="ruby-keyword kw">end</span>
19
20 <span class="ruby-identifier">en</span> <span class="ruby-operator">*=</span> <span class="ruby-value">-1</span>
21 <span class="ruby-identifier">en</span>
22 <span class="ruby-keyword kw">end</span></pre>
23 </body>
24 </html>
25
doc/classes/Mirimiri/Document.src/M000026.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>tf (Mirimiri::Document)</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
8 </head>
9 <body class="standalone-code">
10 <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 97</span>
11 <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">tf</span>(<span class="ruby-identifier">s</span>)
12 <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">count_words</span>[<span class="ruby-identifier">s</span>].<span class="ruby-identifier">to_f</span><span class="ruby-operator">/</span><span class="ruby-ivar">@words</span>.<span class="ruby-identifier">size</span>.<span class="ruby-identifier">to_f</span>
13 <span class="ruby-keyword kw">end</span></pre>
14 </body>
15 </html>
16
doc/classes/Mirimiri/Document.src/M000027.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>new (Mirimiri::Document)</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
8 </head>
9 <body class="standalone-code">
10 <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 102</span>
11 <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">content</span>)
12 <span class="ruby-ivar">@doc_content</span> = <span class="ruby-identifier">content</span>
13 <span class="ruby-ivar">@words</span> = <span class="ruby-identifier">format_words</span>
14 <span class="ruby-keyword kw">end</span></pre>
15 </body>
16 </html>
17
doc/classes/Mirimiri/WebDocument.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>Class: Mirimiri::WebDocument [RDoc Documentation]</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <meta http-equiv="Content-Script-Type" content="text/javascript" />
8 <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
9 <script type="text/javascript">
10 // <![CDATA[
11
12 function popupCode( url ) {
13 window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
14 }
15
16 function toggleCode( id ) {
17 if ( document.getElementById )
18 elem = document.getElementById( id );
19 else if ( document.all )
20 elem = eval( "document.all." + id );
21 else
22 return false;
23
24 elemStyle = elem.style;
25
26 if ( elemStyle.display != "block" ) {
27 elemStyle.display = "block"
28 } else {
29 elemStyle.display = "none"
30 }
31
32 return true;
33 }
34
35 // Make codeblocks hidden by default
36 document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" )
37
38 // ]]>
39 </script>
40
41 </head>
42 <body>
43
44
45 <div id="classHeader">
46 <table class="header-table">
47 <tr class="top-aligned-row">
48 <td><strong>Class</strong></td>
49 <td class="class-name-in-header">Mirimiri::WebDocument</td>
50 </tr>
51 <tr class="top-aligned-row">
52 <td><strong>In:</strong></td>
53 <td>
54
55
56 <a href="../../files/lib/mirimiri/document_rb.html">
57
58 lib/mirimiri/document.rb
59
60 </a>
61
62
63 <br />
64
65 </td>
66 </tr>
67
68
69 <tr class="top-aligned-row">
70 <td><strong>Parent:</strong></td>
71 <td>
72
73 <a href="Document.html">
74
75 Mirimiri::Document
76
77 </a>
78
79 </td>
80 </tr>
81
82 </table>
83 </div>
84 <!-- banner header -->
85
86 <div id="bodyContent">
87
88 <div id="contextContent">
89
90 <div id="description">
91 <p>
92 A <a href="WebDocument.html">WebDocument</a> is a <a
93 href="Document.html">Document</a> with a <tt>url</tt>.
94 </p>
95
96 </div>
97
98 </div>
99
100
101 <div id="method-list">
102 <h3 class="section-bar">Methods</h3>
103
104 <div class="name-list">
105
106 <a href="#M000028">get_content</a>&nbsp;&nbsp;
107
108 <a href="#M000029">new</a>&nbsp;&nbsp;
109
110 </div>
111 </div>
112
113 </div>
114
115 <!-- if includes -->
116
117 <div id="section">
118
119
120
121 <div id="attribute-list">
122 <h3 class="section-bar">Attributes</h3>
123
124 <div class="name-list">
125 <table>
126
127 <tr class="top-aligned-row context-row">
128 <td class="context-item-name">url</td>
129
130 <td class="context-item-value">&nbsp;[R]&nbsp;</td>
131
132 <td class="context-item-desc"></td>
133 </tr>
134
135 </table>
136 </div>
137 </div>
138
139
140 <!-- if method_list -->
141
142 <div id="methods">
143
144 <h3 class="section-bar">Public Class methods</h3>
145
146
147 <div id="method-M000028" class="method-detail">
148 <a name="M000028"></a>
149
150 <div class="method-heading">
151
152 <a href="WebDocument.src/M000028.html" target="Code" class="method-signature"
153 onclick="popupCode('WebDocument.src/M000028.html');return false;">
154
155 <span class="method-name">get_content</span><span class="method-args">(url)</span>
156
157 </a>
158
159 </div>
160
161 <div class="method-description">
162
163 <p>
164 Returns the HTML text from the page of a given <tt>url</tt>.
165 </p>
166
167 </div>
168 </div>
169
170
171 <div id="method-M000029" class="method-detail">
172 <a name="M000029"></a>
173
174 <div class="method-heading">
175
176 <a href="WebDocument.src/M000029.html" target="Code" class="method-signature"
177 onclick="popupCode('WebDocument.src/M000029.html');return false;">
178
179 <span class="method-name">new</span><span class="method-args">(url)</span>
180
181 </a>
182
183 </div>
184
185 <div class="method-description">
186
187 <p>
188 <a href="WebDocument.html">WebDocument</a> constructor, the content of the
189 <a href="Document.html">Document</a> is the HTML page without the tags.
190 </p>
191
192 </div>
193 </div>
194
195
196
197 </div>
198
199
200
201
202 </div>
203
204 <div id="validator-badges">
205 <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
206 </div>
207
208 </body>
209 </html>
210
doc/classes/Mirimiri/WebDocument.src/M000028.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>get_content (Mirimiri::WebDocument)</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
8 </head>
9 <body class="standalone-code">
10 <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 115</span>
11 <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">get_content</span>(<span class="ruby-identifier">url</span>)
12 <span class="ruby-identifier">require</span> <span class="ruby-value str">'net/http'</span>
13 <span class="ruby-constant">Net</span><span class="ruby-operator">::</span><span class="ruby-constant">HTTP</span>.<span class="ruby-identifier">get</span>(<span class="ruby-constant">URI</span>.<span class="ruby-identifier">parse</span>(<span class="ruby-identifier">url</span>))
14 <span class="ruby-keyword kw">end</span></pre>
15 </body>
16 </html>
17
doc/classes/Mirimiri/WebDocument.src/M000029.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>new (Mirimiri::WebDocument)</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
8 </head>
9 <body class="standalone-code">
10 <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 122</span>
11 <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">url</span>)
12 <span class="ruby-ivar">@url</span> = <span class="ruby-identifier">url</span>
13 <span class="ruby-keyword kw">super</span> <span class="ruby-constant">WebDocument</span>.<span class="ruby-identifier">get_content</span>(<span class="ruby-identifier">url</span>).<span class="ruby-identifier">strip_javascripts</span>.<span class="ruby-identifier">strip_stylesheets</span>.<span class="ruby-identifier">strip_xml_tags</span>
14 <span class="ruby-keyword kw">end</span></pre>
15 </body>
16 </html>
17
doc/classes/Mirimiri/WikipediaPage.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>Class: Mirimiri::WikipediaPage [RDoc Documentation]</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <meta http-equiv="Content-Script-Type" content="text/javascript" />
8 <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
9 <script type="text/javascript">
10 // <![CDATA[
11
12 function popupCode( url ) {
13 window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
14 }
15
16 function toggleCode( id ) {
17 if ( document.getElementById )
18 elem = document.getElementById( id );
19 else if ( document.all )
20 elem = eval( "document.all." + id );
21 else
22 return false;
23
24 elemStyle = elem.style;
25
26 if ( elemStyle.display != "block" ) {
27 elemStyle.display = "block"
28 } else {
29 elemStyle.display = "none"
30 }
31
32 return true;
33 }
34
35 // Make codeblocks hidden by default
36 document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" )
37
38 // ]]>
39 </script>
40
41 </head>
42 <body>
43
44
45 <div id="classHeader">
46 <table class="header-table">
47 <tr class="top-aligned-row">
48 <td><strong>Class</strong></td>
49 <td class="class-name-in-header">Mirimiri::WikipediaPage</td>
50 </tr>
51 <tr class="top-aligned-row">
52 <td><strong>In:</strong></td>
53 <td>
54
55
56 <a href="../../files/lib/mirimiri/document_rb.html">
57
58 lib/mirimiri/document.rb
59
60 </a>
61
62
63 <br />
64
65 </td>
66 </tr>
67
68
69 <tr class="top-aligned-row">
70 <td><strong>Parent:</strong></td>
71 <td>
72
73 <a href="WebDocument.html">
74
75 Mirimiri::WebDocument
76
77 </a>
78
79 </td>
80 </tr>
81
82 </table>
83 </div>
84 <!-- banner header -->
85
86 <div id="bodyContent">
87
88 <div id="contextContent">
89
90 <div id="description">
91 <p>
92 A <a href="WikipediaPage.html">WikipediaPage</a> is a <a
93 href="WebDocument.html">WebDocument</a>.
94 </p>
95
96 </div>
97
98 </div>
99
100
101 <div id="method-list">
102 <h3 class="section-bar">Methods</h3>
103
104 <div class="name-list">
105
106 <a href="#M000031">get_url</a>&nbsp;&nbsp;
107
108 <a href="#M000032">search_homepage</a>&nbsp;&nbsp;
109
110 <a href="#M000030">search_wikipedia_titles</a>&nbsp;&nbsp;
111
112 </div>
113 </div>
114
115 </div>
116
117 <!-- if includes -->
118
119 <div id="section">
120
121
122
123
124 <!-- if method_list -->
125
126 <div id="methods">
127
128 <h3 class="section-bar">Public Class methods</h3>
129
130
131 <div id="method-M000031" class="method-detail">
132 <a name="M000031"></a>
133
134 <div class="method-heading">
135
136 <a href="WikipediaPage.src/M000031.html" target="Code" class="method-signature"
137 onclick="popupCode('WikipediaPage.src/M000031.html');return false;">
138
139 <span class="method-name">get_url</span><span class="method-args">(name)</span>
140
141 </a>
142
143 </div>
144
145 <div class="method-description">
146
147 </div>
148 </div>
149
150
151 <div id="method-M000032" class="method-detail">
152 <a name="M000032"></a>
153
154 <div class="method-heading">
155
156 <a href="WikipediaPage.src/M000032.html" target="Code" class="method-signature"
157 onclick="popupCode('WikipediaPage.src/M000032.html');return false;">
158
159 <span class="method-name">search_homepage</span><span class="method-args">(name)</span>
160
161 </a>
162
163 </div>
164
165 <div class="method-description">
166
167 </div>
168 </div>
169
170
171 <div id="method-M000030" class="method-detail">
172 <a name="M000030"></a>
173
174 <div class="method-heading">
175
176 <a href="WikipediaPage.src/M000030.html" target="Code" class="method-signature"
177 onclick="popupCode('WikipediaPage.src/M000030.html');return false;">
178
179 <span class="method-name">search_wikipedia_titles</span><span class="method-args">(name)</span>
180
181 </a>
182
183 </div>
184
185 <div class="method-description">
186
187 </div>
188 </div>
189
190
191
192 </div>
193
194
195
196
197 </div>
198
199 <div id="validator-badges">
200 <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
201 </div>
202
203 </body>
204 </html>
205
doc/classes/Mirimiri/WikipediaPage.src/M000030.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>search_wikipedia_titles (Mirimiri::WikipediaPage)</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
8 </head>
9 <body class="standalone-code">
10 <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 135</span>
11 <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">search_wikipedia_titles</span>(<span class="ruby-identifier">name</span>)
12 <span class="ruby-identifier">raise</span> <span class="ruby-constant">ArgumentError</span>, <span class="ruby-value str">&quot;Bad encoding&quot;</span>, <span class="ruby-identifier">name</span> <span class="ruby-keyword kw">unless</span> <span class="ruby-identifier">name</span>.<span class="ruby-identifier">isutf8</span>
13
14 <span class="ruby-identifier">res</span> = <span class="ruby-constant">REXML</span><span class="ruby-operator">::</span><span class="ruby-constant">Document</span>.<span class="ruby-identifier">new</span>(<span class="ruby-constant">Net</span><span class="ruby-operator">::</span><span class="ruby-constant">HTTP</span>.<span class="ruby-identifier">get</span>( <span class="ruby-constant">URI</span>.<span class="ruby-identifier">parse</span> <span class="ruby-node">&quot;http://en.wikipedia.org/w/api.php?action=query&amp;list=search&amp;srsearch=#{URI.escape name}&amp;format=xml&quot;</span> ).<span class="ruby-identifier">toutf8</span>).<span class="ruby-identifier">elements</span>[<span class="ruby-value str">'api/query/search'</span>]
15
16 <span class="ruby-identifier">res</span>.<span class="ruby-identifier">collect</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">e</span><span class="ruby-operator">|</span> <span class="ruby-identifier">e</span>.<span class="ruby-identifier">attributes</span>[<span class="ruby-value str">'title'</span>] } <span class="ruby-keyword kw">unless</span> <span class="ruby-identifier">res</span>.<span class="ruby-identifier">nil?</span>
17 <span class="ruby-keyword kw">end</span></pre>
18 </body>
19 </html>
20
doc/classes/Mirimiri/WikipediaPage.src/M000031.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>get_url (Mirimiri::WikipediaPage)</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
8 </head>
9 <body class="standalone-code">
10 <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 143</span>
11 <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">get_url</span>(<span class="ruby-identifier">name</span>)
12 <span class="ruby-identifier">raise</span> <span class="ruby-constant">ArgumentError</span>, <span class="ruby-value str">&quot;Bad encoding&quot;</span>, <span class="ruby-identifier">name</span> <span class="ruby-keyword kw">unless</span> <span class="ruby-identifier">name</span>.<span class="ruby-identifier">isutf8</span>
13
14 <span class="ruby-identifier">atts</span> = <span class="ruby-constant">REXML</span><span class="ruby-operator">::</span><span class="ruby-constant">Document</span>.<span class="ruby-identifier">new</span>(<span class="ruby-constant">Net</span><span class="ruby-operator">::</span><span class="ruby-constant">HTTP</span>.<span class="ruby-identifier">get</span>( <span class="ruby-constant">URI</span>.<span class="ruby-identifier">parse</span> <span class="ruby-node">&quot;http://en.wikipedia.org/w/api.php?action=query&amp;titles=#{URI.escape name}&amp;inprop=url&amp;prop=info&amp;format=xml&quot;</span> ).<span class="ruby-identifier">toutf8</span>).<span class="ruby-identifier">elements</span>[<span class="ruby-value str">'api/query/pages/page'</span>].<span class="ruby-identifier">attributes</span>
15
16 <span class="ruby-identifier">atts</span>[<span class="ruby-value str">'fullurl'</span>] <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">atts</span>[<span class="ruby-value str">'missing'</span>].<span class="ruby-identifier">nil?</span>
17 <span class="ruby-keyword kw">end</span></pre>
18 </body>
19 </html>
20
doc/classes/Mirimiri/WikipediaPage.src/M000032.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>search_homepage (Mirimiri::WikipediaPage)</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
8 </head>
9 <body class="standalone-code">
10 <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 151</span>
11 <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">search_homepage</span>(<span class="ruby-identifier">name</span>)
12 <span class="ruby-identifier">title</span> = <span class="ruby-constant">WikipediaPage</span>.<span class="ruby-identifier">search_wikipedia_titles</span> <span class="ruby-identifier">name</span>
13
14 <span class="ruby-constant">WikipediaPage</span>.<span class="ruby-identifier">new</span>(<span class="ruby-constant">WikipediaPage</span>.<span class="ruby-identifier">get_url</span> <span class="ruby-identifier">title</span>[<span class="ruby-value">0</span>]) <span class="ruby-keyword kw">unless</span> <span class="ruby-identifier">title</span>.<span class="ruby-identifier">nil?</span> <span class="ruby-operator">||</span> <span class="ruby-identifier">title</span>.<span class="ruby-identifier">empty?</span>
15 <span class="ruby-keyword kw">end</span></pre>
16 </body>
17 </html>
18
doc/files/lib/mirimiri/corpus_rb.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>File: corpus.rb [RDoc Documentation]</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <meta http-equiv="Content-Script-Type" content="text/javascript" />
8 <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
9 <script type="text/javascript">
10 // <![CDATA[
11
12 function popupCode( url ) {
13 window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
14 }
15
16 function toggleCode( id ) {
17 if ( document.getElementById )
18 elem = document.getElementById( id );
19 else if ( document.all )
20 elem = eval( "document.all." + id );
21 else
22 return false;
23
24 elemStyle = elem.style;
25
26 if ( elemStyle.display != "block" ) {
27 elemStyle.display = "block"
28 } else {
29 elemStyle.display = "none"
30 }
31
32 return true;
33 }
34
35 // Make codeblocks hidden by default
36 document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" )
37
38 // ]]>
39 </script>
40
41 </head>
42 <body>
43
44
45 <div id="fileHeader">
46 <h1>corpus.rb</h1>
47 <table class="header-table">
48 <tr class="top-aligned-row">
49 <td><strong>Path:</strong></td>
50 <td>lib/mirimiri/corpus.rb
51
52 </td>
53 </tr>
54 <tr class="top-aligned-row">
55 <td><strong>Last Update:</strong></td>
56 <td>2010-12-20 10:35:26 +0100</td>
57 </tr>
58 </table>
59 </div>
60 <!-- banner header -->
61
62 <div id="bodyContent">
63
64 <div id="contextContent">
65
66 </div>
67
68
69 </div>
70
71 <!-- if includes -->
72
73 <div id="section">
74
75
76
77
78 <!-- if method_list -->
79
80
81
82
83 </div>
84
85 <div id="validator-badges">
86 <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
87 </div>
88
89 </body>
90 </html>
91
doc/files/lib/mirimiri/document_rb.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>File: document.rb [RDoc Documentation]</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <meta http-equiv="Content-Script-Type" content="text/javascript" />
8 <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
9 <script type="text/javascript">
10 // <![CDATA[
11
12 function popupCode( url ) {
13 window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
14 }
15
16 function toggleCode( id ) {
17 if ( document.getElementById )
18 elem = document.getElementById( id );
19 else if ( document.all )
20 elem = eval( "document.all." + id );
21 else
22 return false;
23
24 elemStyle = elem.style;
25
26 if ( elemStyle.display != "block" ) {
27 elemStyle.display = "block"
28 } else {
29 elemStyle.display = "none"
30 }
31
32 return true;
33 }
34
35 // Make codeblocks hidden by default
36 document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" )
37
38 // ]]>
39 </script>
40
41 </head>
42 <body>
43
44
45 <div id="fileHeader">
46 <h1>document.rb</h1>
47 <table class="header-table">
48 <tr class="top-aligned-row">
49 <td><strong>Path:</strong></td>
50 <td>lib/mirimiri/document.rb
51
52 </td>
53 </tr>
54 <tr class="top-aligned-row">
55 <td><strong>Last Update:</strong></td>
56 <td>2010-12-20 10:36:07 +0100</td>
57 </tr>
58 </table>
59 </div>
60 <!-- banner header -->
61
62 <div id="bodyContent">
63
64 <div id="contextContent">
65
66 <div id="requires-list">
67 <h3 class="section-bar">Required files</h3>
68
69 <div class="name-list">
70
71 net/http&nbsp;&nbsp;
72
73 rexml/document&nbsp;&nbsp;
74
75 net/http&nbsp;&nbsp;
76
77 kconv&nbsp;&nbsp;
78
79 </div>
80 </div>
81
82 </div>
83
84
85 </div>
86
87 <!-- if includes -->
88
89 <div id="section">
90
91
92
93
94 <!-- if method_list -->
95
96
97
98
99 </div>
100
101 <div id="validator-badges">
102 <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
103 </div>
104
105 </body>
106 </html>
107
doc/files/lib/mirimiri/query_rb.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>File: query.rb [RDoc Documentation]</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <meta http-equiv="Content-Script-Type" content="text/javascript" />
8 <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
9 <script type="text/javascript">
10 // <![CDATA[
11
12 function popupCode( url ) {
13 window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
14 }
15
16 function toggleCode( id ) {
17 if ( document.getElementById )
18 elem = document.getElementById( id );
19 else if ( document.all )
20 elem = eval( "document.all." + id );
21 else
22 return false;
23
24 elemStyle = elem.style;
25
26 if ( elemStyle.display != "block" ) {
27 elemStyle.display = "block"
28 } else {
29 elemStyle.display = "none"
30 }
31
32 return true;
33 }
34
35 // Make codeblocks hidden by default
36 document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" )
37
38 // ]]>
39 </script>
40
41 </head>
42 <body>
43
44
45 <div id="fileHeader">
46 <h1>query.rb</h1>
47 <table class="header-table">
48 <tr class="top-aligned-row">
49 <td><strong>Path:</strong></td>
50 <td>lib/mirimiri/query.rb
51
52 </td>
53 </tr>
54 <tr class="top-aligned-row">
55 <td><strong>Last Update:</strong></td>
56 <td>2010-12-20 10:36:27 +0100</td>
57 </tr>
58 </table>
59 </div>
60 <!-- banner header -->
61
62 <div id="bodyContent">
63
64 <div id="contextContent">
65
66 </div>
67
68
69 </div>
70
71 <!-- if includes -->
72
73 <div id="section">
74
75
76
77
78 <!-- if method_list -->
79
80
81
82
83 </div>
84
85 <div id="validator-badges">
86 <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
87 </div>
88
89 </body>
90 </html>
91
doc/files/lib/mirimiri/regexp_rb.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>File: regexp.rb [RDoc Documentation]</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <meta http-equiv="Content-Script-Type" content="text/javascript" />
8 <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
9 <script type="text/javascript">
10 // <![CDATA[
11
12 function popupCode( url ) {
13 window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
14 }
15
16 function toggleCode( id ) {
17 if ( document.getElementById )
18 elem = document.getElementById( id );
19 else if ( document.all )
20 elem = eval( "document.all." + id );
21 else
22 return false;
23
24 elemStyle = elem.style;
25
26 if ( elemStyle.display != "block" ) {
27 elemStyle.display = "block"
28 } else {
29 elemStyle.display = "none"
30 }
31
32 return true;
33 }
34
35 // Make codeblocks hidden by default
36 document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" )
37
38 // ]]>
39 </script>
40
41 </head>
42 <body>
43
44
45 <div id="fileHeader">
46 <h1>regexp.rb</h1>
47 <table class="header-table">
48 <tr class="top-aligned-row">
49 <td><strong>Path:</strong></td>
50 <td>lib/mirimiri/regexp.rb
51
52 </td>
53 </tr>
54 <tr class="top-aligned-row">
55 <td><strong>Last Update:</strong></td>
56 <td>2010-12-20 10:36:42 +0100</td>
57 </tr>
58 </table>
59 </div>
60 <!-- banner header -->
61
62 <div id="bodyContent">
63
64 <div id="contextContent">
65
66 </div>
67
68
69 </div>
70
71 <!-- if includes -->
72
73 <div id="section">
74
75
76
77
78 <!-- if method_list -->
79
80
81
82
83 </div>
84
85 <div id="validator-badges">
86 <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
87 </div>
88
89 </body>
90 </html>
91
doc/files/lib/mirimiri/string_rb.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>File: string.rb [RDoc Documentation]</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <meta http-equiv="Content-Script-Type" content="text/javascript" />
8 <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
9 <script type="text/javascript">
10 // <![CDATA[
11
12 function popupCode( url ) {
13 window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
14 }
15
16 function toggleCode( id ) {
17 if ( document.getElementById )
18 elem = document.getElementById( id );
19 else if ( document.all )
20 elem = eval( "document.all." + id );
21 else
22 return false;
23
24 elemStyle = elem.style;
25
26 if ( elemStyle.display != "block" ) {
27 elemStyle.display = "block"
28 } else {
29 elemStyle.display = "none"
30 }
31
32 return true;
33 }
34
35 // Make codeblocks hidden by default
36 document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" )
37
38 // ]]>
39 </script>
40
41 </head>
42 <body>
43
44
45 <div id="fileHeader">
46 <h1>string.rb</h1>
47 <table class="header-table">
48 <tr class="top-aligned-row">
49 <td><strong>Path:</strong></td>
50 <td>lib/mirimiri/string.rb
51
52 </td>
53 </tr>
54 <tr class="top-aligned-row">
55 <td><strong>Last Update:</strong></td>
56 <td>2010-12-20 10:37:16 +0100</td>
57 </tr>
58 </table>
59 </div>
60 <!-- banner header -->
61
62 <div id="bodyContent">
63
64 <div id="contextContent">
65
66 <div id="description">
67 <hr size="1"></hr><p>
68 General module
69 </p>
70
71 </div>
72
73 <div id="requires-list">
74 <h3 class="section-bar">Required files</h3>
75
76 <div class="name-list">
77
78 cgi&nbsp;&nbsp;
79
80 kconv&nbsp;&nbsp;
81
82 </div>
83 </div>
84
85 </div>
86
87
88 </div>
89
90 <!-- if includes -->
91
92 <div id="section">
93
94
95
96
97 <!-- if method_list -->
98
99
100
101
102 </div>
103
104 <div id="validator-badges">
105 <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
106 </div>
107
108 </body>
109 </html>
110
doc/files/lib/mirimiri/ttagger_rb.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>File: ttagger.rb [RDoc Documentation]</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <meta http-equiv="Content-Script-Type" content="text/javascript" />
8 <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" />
9 <script type="text/javascript">
10 // <![CDATA[
11
12 function popupCode( url ) {
13 window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
14 }
15
16 function toggleCode( id ) {
17 if ( document.getElementById )
18 elem = document.getElementById( id );
19 else if ( document.all )
20 elem = eval( "document.all." + id );
21 else
22 return false;
23
24 elemStyle = elem.style;
25
26 if ( elemStyle.display != "block" ) {
27 elemStyle.display = "block"
28 } else {
29 elemStyle.display = "none"
30 }
31
32 return true;
33 }
34
35 // Make codeblocks hidden by default
36 document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" )
37
38 // ]]>
39 </script>
40
41 </head>
42 <body>
43
44
45 <div id="fileHeader">
46 <h1>ttagger.rb</h1>
47 <table class="header-table">
48 <tr class="top-aligned-row">
49 <td><strong>Path:</strong></td>
50 <td>lib/mirimiri/ttagger.rb
51
52 </td>
53 </tr>
54 <tr class="top-aligned-row">
55 <td><strong>Last Update:</strong></td>
56 <td>2010-12-20 10:37:32 +0100</td>
57 </tr>
58 </table>
59 </div>
60 <!-- banner header -->
61
62 <div id="bodyContent">
63
64 <div id="contextContent">
65
66 </div>
67
68
69 </div>
70
71 <!-- if includes -->
72
73 <div id="section">
74
75
76
77
78 <!-- if method_list -->
79
80
81
82
83 </div>
84
85 <div id="validator-badges">
86 <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
87 </div>
88
89 </body>
90 </html>
91
doc/files/lib/mirimiri_rb.html
File was created 1 <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
2 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
3 <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
4 <head>
5 <title>File: mirimiri.rb [RDoc Documentation]</title>
6 <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
7 <meta http-equiv="Content-Script-Type" content="text/javascript" />
8 <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" />
9 <script type="text/javascript">
10 // <![CDATA[
11
12 function popupCode( url ) {
13 window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400")
14 }
15
16 function toggleCode( id ) {
17 if ( document.getElementById )
18 elem = document.getElementById( id );
19 else if ( document.all )
20 elem = eval( "document.all." + id );
21 else
22 return false;
23
24 elemStyle = elem.style;
25
26 if ( elemStyle.display != "block" ) {
27 elemStyle.display = "block"
28 } else {
29 elemStyle.display = "none"
30 }
31
32 return true;
33 }
34
35 // Make codeblocks hidden by default
36 document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" )
37
38 // ]]>
39 </script>
40
41 </head>
42 <body>
43
44
45 <div id="fileHeader">
46 <h1>mirimiri.rb</h1>
47 <table class="header-table">
48 <tr class="top-aligned-row">
49 <td><strong>Path:</strong></td>
50 <td>lib/mirimiri.rb
51
52 </td>
53 </tr>
54 <tr class="top-aligned-row">
55 <td><strong>Last Update:</strong></td>
56 <td>2010-12-20 10:33:51 +0100</td>
57 </tr>
58 </table>
59 </div>
60 <!-- banner header -->
61
62 <div id="bodyContent">
63
64 <div id="contextContent">
65
66 <div id="requires-list">
67 <h3 class="section-bar">Required files</h3>
68
69 <div class="name-list">
70
71 mirimiri/document&nbsp;&nbsp;
72
73 mirimiri/string&nbsp;&nbsp;
74
75 mirimiri/query&nbsp;&nbsp;
76
77 mirimiri/corpus&nbsp;&nbsp;
78
79 mirimiri/regexp&nbsp;&nbsp;
80
81 mirimiri/ttagger&nbsp;&nbsp;
82
83 </div>
84 </div>
85
86 </div>
87
88
89 </div>
90
91 <!-- if includes -->
92
93 <div id="section">
94
95
96
97
98 <!-- if method_list -->
99
100
101
102
103 </div>
104
105 <div id="validator-badges">
106 <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p>
107 </div>
108
109 </body>
110 </html>
111
File was created 1 #!/usr/bin/env ruby
2
3 require 'mirimiri/document'
4 require 'mirimiri/string'
5 require 'mirimiri/query'
6 require 'mirimiri/corpus'
7 require 'mirimiri/regexp'
8 require 'mirimiri/ttagger'
9
lib/mirimiri/corpus.rb
File was created 1 #!/usr/bin/env ruby
2
3 #--
4 # This file is a part of the mirimiri library
5 #
6 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
7 #
8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the GNU General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 #++
21
# A Corpus is a directory tree of files, identified by its root +path+.
class Corpus
  attr_accessor :path

  # Creates a Corpus rooted at +path+.
  #
  # One trailing "/" is removed so that the paths built by #files never
  # contain a doubled separator.
  def initialize(path)
    @path = path.chomp("/")
  end

  # Recursively outputs all regular files in +self.path+. Each entry is
  # prefixed with +self.path+; directories are never returned, and files
  # without an extension (e.g. "README") are included.
  # Hidden entries (names starting with ".") are not matched by the glob.
  #
  # WARNING ! This function may take a lot of time if many
  # files are in subdirectories.
  #
  #   c = Corpus.new "my/path"
  #   c.files # => ["my/path/README", "my/path/lib/code.rb"]
  def files
    # "**/*" matches every entry at any depth; the former "*.*" pattern
    # missed extension-less files and matched directories such as "v1.0".
    Dir.glob("#{@path}/**/*").select { |entry| File.file?(entry) }
  end
end
39
lib/mirimiri/document.rb
File was created 1 #!/usr/bin/env ruby
2
3 #--
4 # This file is a part of the mirimiri library
5 #
6 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
7 #
8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the GNU General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 #++
21
22
23 # General module
24 module Mirimiri
25
26 # A Document is a bag of words and is constructed from a string.
class Document
  attr_reader :words, :doc_content

  # Builds a Document from the string +content+.
  #
  # The raw text is kept in +doc_content+ and the normalized word list
  # (see #format_words) in +words+.
  def initialize(content)
    @doc_content = content
    @words = format_words
  end

  # Returns an Array containing the distinct +n+-grams (words) from the
  # current Document, in order of first appearance. Returns an empty Array
  # when +n+ is smaller than 1 or larger than the number of words.
  #
  #   ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]
  def ngrams(n)
    # each_cons raises on a non-positive size; an empty result is returned instead.
    return [] if n < 1

    @words.each_cons(n).map { |gram| gram.join(" ") }.uniq
  end

  # Returns a Hash containing the words and their associated counts in the current Document.
  # Looking up a word that does not occur returns 0 and does not add the key.
  #
  #   count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }
  def count_words
    @words.each_with_object(Hash.new(0)) { |w, counts| counts[w] += 1 }
  end

  # Computes the entropy of a given string +s+ inside the document.
  #
  # If the string parameter is composed of many words (i.e. tokens separated
  # by whitespace(s)), it is considered as an ngram: the result is the sum of
  # -p(w) * log2(p(w)) over its words, where p(w) is the relative frequency
  # of +w+ in the document.
  #
  # Words that do not occur in the document contribute 0 to the sum, so the
  # result is 0.0 (never NaN) for unknown words or an empty document.
  #
  #   entropy("guitar")                 #=> 0.00432114812727959
  #   entropy("dillinger escape plan")  #=> 0.265862076325102
  def entropy(s)
    total  = @words.size.to_f
    counts = count_words

    s.split.reduce(0.0) do |sum, w|
      occurrences = counts[w]
      # p = 0 would give 0 * log2(0) = NaN; its contribution is defined as 0.
      next sum if occurrences.zero?

      p_wi = occurrences / total
      sum - p_wi * Math.log2(p_wi)
    end
  end

  # Computes the term frequency of a given *word* +s+, i.e. its number of
  # occurrences divided by the total number of words. Returns 0.0 for an
  # empty document.
  #
  #   tf("guitar") #=> 0.000380372765310004
  def tf(s)
    return 0.0 if @words.empty?

    count_words[s].to_f / @words.size
  end

  protected

  # Returns the downcased words of +doc_content+.
  #
  # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html
  # and the \\W special escape); tokens without any ASCII letter are dropped.
  #
  # Protected function, only meant to be called at the initialization.
  def format_words
    @doc_content.scan(/\w+/).select { |token| token =~ /[a-zA-Z]/ }.map(&:downcase)
  end
end
109
# A WebDocument is a Document with a +url+.
class WebDocument < Document
  attr_reader :url

  # Fetches +url+ over HTTP and returns the body of the response
  # (the HTML text of the page).
  def self.get_content(url)
    require 'net/http'
    target = URI.parse(url)
    Net::HTTP.get(target)
  end

  # WebDocument constructor: remembers +url+, downloads the page and uses
  # its text, with scripts, stylesheets and tags stripped, as the content
  # of the Document.
  def initialize(url)
    @url = url
    html = WebDocument.get_content(url)
    text = html.strip_javascripts.strip_stylesheets.strip_xml_tags
    super(text)
  end
end
127
# A WikipediaPage is a WebDocument whose URL is looked up through the
# English Wikipedia web API.
class WikipediaPage < WebDocument
  require 'rexml/document'
  require 'net/http'
  require 'kconv'

  # Runs an "action=query" request against the Wikipedia API and returns the
  # parsed XML answer as a REXML::Document. +params+ is an already-encoded
  # query-string fragment (e.g. "list=search&srsearch=foo").
  #
  # NOTE(review): the endpoint is requested over plain http and
  # Net::HTTP.get does not follow redirects -- confirm the API still answers
  # on this scheme.
  def self.api_query(params)
    uri = URI.parse "http://en.wikipedia.org/w/api.php?action=query&#{params}&format=xml"
    REXML::Document.new(Net::HTTP.get(uri).toutf8)
  end
  private_class_method :api_query

  # Searches Wikipedia for +name+ and returns an Array with the titles of
  # the matching pages, or +nil+ when the answer has no search section.
  #
  # Raises ArgumentError if +name+ is not UTF-8.
  def self.search_wikipedia_titles(name)
    raise ArgumentError, "Bad encoding: #{name.inspect}" unless name.isutf8

    # encode_www_form_component also escapes "&" and "=", which would
    # otherwise corrupt the query string.
    res = api_query("list=search&srsearch=#{URI.encode_www_form_component(name)}").elements['api/query/search']

    # Iterate over child *elements* only; text nodes have no attributes.
    res.elements.map { |e| e.attributes['title'] } unless res.nil?
  end

  # Returns the full URL of the page titled +name+, or +nil+ when the page
  # is reported missing or the answer contains no page at all.
  #
  # Raises ArgumentError if +name+ is not UTF-8.
  def self.get_url(name)
    raise ArgumentError, "Bad encoding: #{name.inspect}" unless name.isutf8

    page = api_query("titles=#{URI.encode_www_form_component(name)}&inprop=url&prop=info").elements['api/query/pages/page']
    return nil if page.nil?

    atts = page.attributes
    atts['fullurl'] if atts['missing'].nil?
  end

  # Returns the WikipediaPage of the first search result for +name+, or
  # +nil+ when nothing is found or the result has no URL.
  def self.search_homepage(name)
    title = WikipediaPage.search_wikipedia_titles name
    return nil if title.nil? || title.empty?

    url = WikipediaPage.get_url title[0]
    WikipediaPage.new(url) unless url.nil?
  end

  # def initialize(name)
  #   title = WikipediaPage.search_wikipedia_titles name
  #   raise ArgumentError, "No page found" if title.empty?
  #   super WikipediaPage.get_url title[0]
  # end
end
163 end
164
lib/mirimiri/query.rb
File was created 1 #!/usr/bin/env ruby
2
3 #--
4 # This file is a part of the mirimiri library
5 #
6 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
7 #
8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the GNU General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 #++
21
# Generic query. Carries no behaviour of its own; it only serves as a common
# superclass for engine-specific queries.
class Query; end
24
25 module Indri
26
# Settings written at the top of an XML parameter document.
#
# #to_s opens the <parameters> element but deliberately does not close it:
# the caller appends its own content and the closing tag.
class Parameters
  attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline

  # * +corpus+ is stored as the index path.
  # * +print_query+ and +print_docs+ are stored as the strings "true" or
  #   "false" depending on their truthiness.
  def initialize(corpus,mem="1g",count="1000",offset="1",run_id="default",print_query=false,print_docs=false)
    @index_path  = corpus
    @memory      = mem
    @count       = count
    @offset      = offset
    @run_id      = run_id
    @print_query = (!!print_query).to_s
    @print_docs  = (!!print_docs).to_s
  end

  # Serializes the settings, one XML element per line. A <baseline> element
  # is emitted when +baseline+ is set, a <rule> element otherwise.
  def to_s
    retrieval = @baseline.nil? ? "<rule>#{@rule}</rule>" : "<baseline>#{@baseline}</baseline>"

    [
      "<parameters>",
      "<memory>#{@memory}</memory>",
      "<index>#{@index_path}</index>",
      "<count>#{@count}</count>",
      retrieval,
      "<queryOffset>#{@offset}</queryOffset>",
      "<runID>#{@run_id}</runID>",
      "<printQuery>#{@print_query}</printQuery>",
      "<printDocuments>#{@print_docs}</printDocuments>"
    ].map { |element| "#{element}\n" }.join
  end
end
58
# A single query together with the parameters used to run it.
class IndriQuery < Query
  attr_accessor :id, :query, :params, :rule

  # * +id+ is the query number.
  # * +query+ is the query text.
  # * +params+ holds the run parameters; its +rule+ is set to a default
  #   value when it is +nil+.
  def initialize(id,query,params)
    @id     = id
    @query  = query
    @params = params

    # Here we set the default retrieval model as Language Modeling
    # with a Dirichlet smoothing at 2500.
    # TODO: maybe a Rule class...
    params.rule = 'method:dirichlet,mu:2500' if params.rule.nil?
  end

  # Serializes the parameters followed by the <query> element, then closes
  # the <parameters> element.
  def to_s
    [
      @params.to_s,
      "<query>\n",
      "<number>#{@id}</number>\n",
      "<text>#{@query}</text>\n",
      "</query>\n",
      "</parameters>"
    ].join
  end
end
84
85 end
86
lib/mirimiri/regexp.rb
File was created 1 #!/usr/bin/env ruby
2
3 #--
4 # This file is a part of the mirimiri library
5 #
6 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
7 #
8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the GNU General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 #++
21
class Regexp

  # Returns a new Regexp matching the lines in which +self+ does not match
  # at any position (built with a negative lookahead applied before every
  # character of the line).
  #
  #   "hello world" =~ /foo/.negated #=> 0
  #   "hello foo"   =~ /foo/.negated #=> nil
  def negated
    Regexp.new('^((?!' + to_s + ').)*$')
  end

end
29
lib/mirimiri/string.rb
File was created 1 #!/usr/bin/env ruby
2
3 #--
4 # This file is a part of the mirimiri library
5 #
6 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
7 #
8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the GNU General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 #++
21
module Mirimiri

  # These are the default stopwords provided by Lemur.
  # All entries are lowercase; the list is a plain (mutable) Array of Strings.
  Stoplist = %w[
    a anything anyway anywhere apart are around as at av
    be became because become becomes becoming been before beforehand
    behind being below beside besides between beyond both but by
    can cannot canst certain cf choose contrariwise cos could cu
    day do does doesn't doing dost doth double down dual during
    each either else elsewhere enough et etc even ever every
    everybody everyone everything everywhere except excepted excepting
    exception exclude excluding exclusive far farther farthest few ff
    first for formerly forth forward from front further furthermore
    furthest get go had halves hardly has hast hath have he
    hence henceforth her here hereabouts hereafter hereby herein hereto
    hereupon hers herself him himself hindmost his hither hitherto
    how however howsoever i ie if in inasmuch inc include
    included including indeed indoors inside insomuch instead into
    inward inwards is it its itself just kind kg km last
    latter latterly less lest let like little ltd many may maybe
    me meantime meanwhile might moreover most mostly more mr mrs
    ms much must my myself namely need neither never nevertheless
    next no nobody none nonetheless noone nope nor not nothing
    notwithstanding now nowadays nowhere of off often ok on once
    one only onto or other others otherwise ought our ours
    ourselves out outside over own per perhaps plenty provide quite
    rather really round said sake same sang save saw see seeing
    seem seemed seeming seems seen seldom selves sent several shalt
    she should shown sideways since slept slew slung slunk smote
    so some somebody somehow someone something sometime sometimes
    somewhat somewhere spake spat spoke spoken sprang sprung stave
    staves still such supposing than that the thee their them
    themselves then thence thenceforth there thereabout thereabouts
    thereafter thereby therefore therein thereof thereon thereto thereupon
    these they this those thou though thrice through throughout thru
    thus thy thyself till to together too toward towards ugh
    unable under underneath unless unlike until up upon upward
    upwards us use used using very via vs want was we week
    well were what whatever whatsoever when whence whenever whensoever
    where whereabouts whereafter whereas whereat whereby wherefore
    wherefrom wherein whereinto whereof whereon wheresoever whereto
    whereunto whereupon wherever wherewith whether whew which whichever
    whichsoever while whilst whither who whoa whoever whole whom
    whomever whomsoever whose whosoever why will wilt with within
    without worse worst would wow ye yet year yippee you your
    yours yourself yourselves
  ]

end
72
# Extension of the standard class String with useful functions.
class String
  include Mirimiri

  # Returns +true+ if +self+ (compared case-insensitively) belongs to
  # Mirimiri::Stoplist, +false+ otherwise.
  def is_stopword?
    Stoplist.include?(downcase)
  end

  # Do not use.
  # TODO: revamp. find why this function is here.
  #
  # As written: non-word characters are turned into separators, tokens made
  # of a single character are dropped and the rest is joined with spaces.
  def remove_special_characters
    split.collect do |chunk|
      pieces = chunk.gsub(/\W/, ' ').split.collect { |piece| piece.gsub(/\W/, ' ').strip.sub(/\A.\z/, '') }
      pieces.join(' ').strip.sub(/\A.\z/, '')
    end.join(' ')
  end

  # Removes all XML-like tags from +self+, in place.
  #
  #   s = "<html><body>test</body></html>"
  #   s.strip_xml_tags!
  #   s #=> "test"
  def strip_xml_tags!
    replace strip_with_pattern(/<\/?[^>]*>/)
  end

  # Returns a copy of +self+ without its XML-like tags.
  #
  #   s = "<html><body>test</body></html>"
  #   s.strip_xml_tags #=> "test"
  #   s                #=> "<html><body>test</body></html>"
  def strip_xml_tags
    dup.strip_xml_tags!
  end

  # Removes all <script> elements (tags and content) from +self+, in place.
  # The opening tag may carry any attributes, with any quoting style, and
  # tag names are matched case-insensitively. Surrounding whitespace is
  # left untouched.
  #
  #   s = "<script type='text/javascript'>
  #         var skin='vector',
  #         stylepath='http://bits.wikimedia.org/skins-1.5'
  #        </script>
  #
  #        test"
  #   s.strip_javascripts!
  #   s.strip #=> "test"
  def strip_javascripts!
    replace strip_with_pattern(/<script\b[^>]*>.*?<\/script\s*>/mi)
  end

  # Returns a copy of +self+ without its <script> elements.
  # See #strip_javascripts! for the matching rules.
  #
  #   s = "<script type='text/javascript'>var skin='vector'</script>test"
  #   s.strip_javascripts #=> "test"
  def strip_javascripts
    dup.strip_javascripts!
  end

  # Removes all <style> elements (tags and content) from +self+, in place.
  # The opening tag may carry any attributes and tag names are matched
  # case-insensitively.
  #
  #   s = "<style type='text/css'>p { color: red }</style>test"
  #   s.strip_stylesheets!
  #   s #=> "test"
  def strip_stylesheets!
    replace strip_with_pattern(/<style\b[^>]*>.*?<\/style\s*>/mi)
  end

  # Returns a copy of +self+ without its <style> elements.
  def strip_stylesheets
    dup.strip_stylesheets!
  end

  # Removes punctuation from +self+, in place: every character that is not
  # an ASCII letter, a digit, a dash or a whitespace is deleted.
  #
  #   s = "hello, world. how are you?!"
  #   s.strip_punctuation!
  #   s # => "hello world how are you"
  def strip_punctuation!
    replace strip_with_pattern(/[^a-zA-Z0-9\-\s]/)
  end

  # Returns a copy of +self+ without punctuation.
  #
  #   s = "hello, world. how are you?!"
  #   s.strip_punctuation # => "hello world how are you"
  def strip_punctuation
    dup.strip_punctuation!
  end

  # Returns the text values inside all occurrences of a XML tag in +self+.
  # Only the tag named exactly +tag_name+ is considered (asking for 'a'
  # does not pick up <abbr> elements); values must sit on a single line.
  #
  #   s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre"
  #   s.extract_xmltags_values 'a' #=> ["Indianapolis", "Indiana"]
  def extract_xmltags_values(tag_name)
    # The name must be followed by ">" or by whitespace and attributes, so
    # that it is not matched as the prefix of a longer tag name.
    scan(/<#{tag_name}(?:\s[^>]*)?>(.+?)<\/#{tag_name}>/).flatten
  end

  # Deletes every match of +pattern+, decodes the HTML entities of the
  # remaining text and converts the result to UTF-8 (through kconv).
  # Returns a new String; +self+ is not modified.
  def strip_with_pattern(pattern)
    require 'cgi'
    require 'kconv'
    CGI.unescapeHTML(gsub(pattern, '')).toutf8
  end

  private :strip_with_pattern
end
175
lib/mirimiri/ttagger.rb
File was created 1 #!/usr/bin/env ruby
2
3 #--
4 # This file is a part of the mirimiri library
5 #
6 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
7 #
8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version.
12 #
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
17 #
18 # You should have received a copy of the GNU General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20 #++
21
22
# TreeTagger-related stuff module.
#
# See http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/DecisionTreeTagger.html
module TreeTagger

  # This class handles generic parsing of tagger-chunker outputs.
  class TaggerChunker
    attr_reader :chunks, :file

    # Parses a tagger-chunker output and returns an Array of Chunk.
    #
    # +chunk_lines+ is an enumerable of lines. A line such as "<NC>" opens
    # a chunk and "</NC>" closes it; of any other line only the first
    # whitespace-separated token is kept as a word. A chunk is emitted when
    # a closing tag matches the most recent opening tag and at least one
    # word has been collected. The lines themselves are not modified, so
    # frozen strings are accepted.
    def self.parse(chunk_lines)
      open  = false
      tag   = nil
      chunks = []
      words  = []

      chunk_lines.each do |raw|
        # Work on a copy: chomp! would mutate the caller's strings.
        line = raw.chomp

        case line
        when /^<\w+>$/
          open = true
          tag  = line
        when /^<\/\w+>$/
          # Ignore closing tags that do not close the current chunk.
          next unless !words.empty? && open && line == tag.sub(/</, '</')

          open = false
          chunks.push Chunk.new(words.join(' '), tag)
          words.clear
        else
          words.push(line.split.first)
        end
      end

      chunks
    end

    # Initializes parsing. +chunk_file+ is the output of +tagger-chunker+ and must
    # be a valid path to the file. The path is kept in +file+ and the parsed
    # chunks in +chunks+.
    #
    #   TaggerChunker.new("ttout/2010020") #=> #<TreeTagger::TaggerChunker:0x92fd088 @chunks=[#<TreeTagger::Chunk:0x8ec5a10 @words=["robert", "schumann"], @tag="NC">, ...] ...>
    def initialize(chunk_file)
      @file = chunk_file
      # File.readlines closes the file once it has been read.
      @chunks = TaggerChunker.parse File.readlines(chunk_file)
    end

  end

  # TaggerChunker for English; adds nothing to the generic parser.
  class TaggerChunkerEnglish < TaggerChunker
  end

  # TaggerChunker for French; adds nothing to the generic parser.
  class TaggerChunkerFrench < TaggerChunker
  end

  # TaggerChunker for German; adds nothing to the generic parser.
  class TaggerChunkerGerman < TaggerChunker
  end

  # Represents a Chunk extracted when parsing a TaggerChunker file.
  class Chunk
    attr_reader :words, :tag

    # Creates a Chunk.
    #
    # * +str+ are whitespace-separated terms.
    # * +tag+ is the opening tag as found in the file (e.g. "<NC>"); its
    #   first and last characters are dropped. For the tag set see :
    #   ftp://ftp.ims.uni-stuttgart.de/pub/corpora/chunker-tagset-english.txt
    def initialize(str, tag)
      @words = str.split
      @tag = tag[1..-2]
    end
  end

end
96