Commit cd74322524114fbad147da48f608384d03e46c58
1 parent
b3995017e6
Exists in
master
adding missing files
Showing 29 changed files with 2390 additions and 0 deletions Inline Diff
- doc/classes/Mirimiri.html
- doc/classes/Mirimiri/Document.html
- doc/classes/Mirimiri/Document.src/M000022.html
- doc/classes/Mirimiri/Document.src/M000023.html
- doc/classes/Mirimiri/Document.src/M000024.html
- doc/classes/Mirimiri/Document.src/M000025.html
- doc/classes/Mirimiri/Document.src/M000026.html
- doc/classes/Mirimiri/Document.src/M000027.html
- doc/classes/Mirimiri/WebDocument.html
- doc/classes/Mirimiri/WebDocument.src/M000028.html
- doc/classes/Mirimiri/WebDocument.src/M000029.html
- doc/classes/Mirimiri/WikipediaPage.html
- doc/classes/Mirimiri/WikipediaPage.src/M000030.html
- doc/classes/Mirimiri/WikipediaPage.src/M000031.html
- doc/classes/Mirimiri/WikipediaPage.src/M000032.html
- doc/files/lib/mirimiri/corpus_rb.html
- doc/files/lib/mirimiri/document_rb.html
- doc/files/lib/mirimiri/query_rb.html
- doc/files/lib/mirimiri/regexp_rb.html
- doc/files/lib/mirimiri/string_rb.html
- doc/files/lib/mirimiri/ttagger_rb.html
- doc/files/lib/mirimiri_rb.html
- lib/mirimiri.rb
- lib/mirimiri/corpus.rb
- lib/mirimiri/document.rb
- lib/mirimiri/query.rb
- lib/mirimiri/regexp.rb
- lib/mirimiri/string.rb
- lib/mirimiri/ttagger.rb
doc/classes/Mirimiri.html
File was created | 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | ||
3 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | ||
4 | <head> | ||
5 | <title>Module: Mirimiri [RDoc Documentation]</title> | ||
6 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | ||
7 | <meta http-equiv="Content-Script-Type" content="text/javascript" /> | ||
8 | <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" /> | ||
9 | <script type="text/javascript"> | ||
10 | // <![CDATA[ | ||
11 | |||
12 | function popupCode( url ) { | ||
13 | window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | ||
14 | } | ||
15 | |||
16 | function toggleCode( id ) { | ||
17 | if ( document.getElementById ) | ||
18 | elem = document.getElementById( id ); | ||
19 | else if ( document.all ) | ||
20 | elem = eval( "document.all." + id ); | ||
21 | else | ||
22 | return false; | ||
23 | |||
24 | elemStyle = elem.style; | ||
25 | |||
26 | if ( elemStyle.display != "block" ) { | ||
27 | elemStyle.display = "block" | ||
28 | } else { | ||
29 | elemStyle.display = "none" | ||
30 | } | ||
31 | |||
32 | return true; | ||
33 | } | ||
34 | |||
35 | // Make codeblocks hidden by default | ||
36 | document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | ||
37 | |||
38 | // ]]> | ||
39 | </script> | ||
40 | |||
41 | </head> | ||
42 | <body> | ||
43 | |||
44 | |||
45 | <div id="classHeader"> | ||
46 | <table class="header-table"> | ||
47 | <tr class="top-aligned-row"> | ||
48 | <td><strong>Module</strong></td> | ||
49 | <td class="class-name-in-header">Mirimiri</td> | ||
50 | </tr> | ||
51 | <tr class="top-aligned-row"> | ||
52 | <td><strong>In:</strong></td> | ||
53 | <td> | ||
54 | |||
55 | |||
56 | <a href="../files/lib/mirimiri/string_rb.html"> | ||
57 | |||
58 | lib/mirimiri/string.rb | ||
59 | |||
60 | </a> | ||
61 | |||
62 | |||
63 | <br /> | ||
64 | |||
65 | |||
66 | <a href="../files/lib/mirimiri/document_rb.html"> | ||
67 | |||
68 | lib/mirimiri/document.rb | ||
69 | |||
70 | </a> | ||
71 | |||
72 | |||
73 | <br /> | ||
74 | |||
75 | </td> | ||
76 | </tr> | ||
77 | |||
78 | |||
79 | </table> | ||
80 | </div> | ||
81 | <!-- banner header --> | ||
82 | |||
83 | <div id="bodyContent"> | ||
84 | |||
85 | <div id="contextContent"> | ||
86 | |||
87 | <div id="description"> | ||
88 | <hr size="1"></hr><p> | ||
89 | General module | ||
90 | </p> | ||
91 | |||
92 | </div> | ||
93 | |||
94 | </div> | ||
95 | |||
96 | |||
97 | </div> | ||
98 | |||
99 | <!-- if includes --> | ||
100 | |||
101 | <div id="section"> | ||
102 | |||
103 | <div id="class-list"> | ||
104 | <h3 class="section-bar">Classes and Modules</h3> | ||
105 | |||
106 | Class <a href="Mirimiri/Document.html" class="link">Mirimiri::Document</a><br /> | ||
107 | Class <a href="Mirimiri/WebDocument.html" class="link">Mirimiri::WebDocument</a><br /> | ||
108 | Class <a href="Mirimiri/WikipediaPage.html" class="link">Mirimiri::WikipediaPage</a><br /> | ||
109 | |||
110 | </div> | ||
111 | |||
112 | <div id="constants-list"> | ||
113 | <h3 class="section-bar">Constants</h3> | ||
114 | |||
115 | <div class="name-list"> | ||
116 | <table summary="Constants"> | ||
117 | |||
118 | <tr class="top-aligned-row context-row"> | ||
119 | <td class="context-item-name">Stoplist</td> | ||
120 | <td>=</td> | ||
121 | <td class="context-item-value">[ "a", "anything", "anyway", "anywhere", "apart", "are", "around", "as", "at", "av", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "both", "but", "by", "can", "cannot", "canst", "certain", "cf", "choose", "contrariwise", "cos", "could", "cu", "day", "do", "does", "doesn't", "doing", "dost", "doth", "double", "down", "dual", "during", "each", "either", "else", "elsewhere", "enough", "et", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "except", "excepted", "excepting", "exception", "exclude", "excluding", "exclusive", "far", "farther", "farthest", "few", "ff", "first", "for", "formerly", "forth", "forward", "from", "front", "further", "furthermore", "furthest", "get", "go", "had", "halves", "hardly", "has", "hast", "hath", "have", "he", "hence", "henceforth", "her", "here", "hereabouts", "hereafter", "hereby", "herein", "hereto", "hereupon", "hers", "herself", "him", "himself", "hindmost", "his", "hither", "hitherto", "how", "however", "howsoever", "i", "ie", "if", "in", "inasmuch", "inc", "include", "included", "including", "indeed", "indoors", "inside", "insomuch", "instead", "into", "inward", "inwards", "is", "it", "its", "itself", "just", "kind", "kg", "km", "last", "latter", "latterly", "less", "lest", "let", "like", "little", "ltd", "many", "may", "maybe", "me", "meantime", "meanwhile", "might", "moreover", "most", "mostly", "more", "mr", "mrs", "ms", "much", "must", "my", "myself", "namely", "need", "neither", "never", "nevertheless", "next", "no", "nobody", "none", "nonetheless", "noone", "nope", "nor", "not", "nothing", "notwithstanding", "now", "nowadays", "nowhere", "of", "off", "often", "ok", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "own", "per", "perhaps", "plenty", 
"provide", "quite", "rather", "really", "round", "said", "sake", "same", "sang", "save", "saw", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "seldom", "selves", "sent", "several", "shalt", "she", "should", "shown", "sideways", "since", "slept", "slew", "slung", "slunk", "smote", "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", "somewhat", "somewhere", "spake", "spat", "spoke", "spoken", "sprang", "sprung", "stave", "staves", "still", "such", "supposing", "than", "that", "the", "thee", "their", "them", "themselves", "then", "thence", "thenceforth", "there", "thereabout", "thereabouts", "thereafter", "thereby", "therefore", "therein", "thereof", "thereon", "thereto", "thereupon", "these", "they", "this", "those", "thou", "though", "thrice", "through", "throughout", "thru", "thus", "thy", "thyself", "till", "to", "together", "too", "toward", "towards", "ugh", "unable", "under", "underneath", "unless", "unlike", "until", "up", "upon", "upward", "upwards", "us", "use", "used", "using", "very", "via", "vs", "want", "was", "we", "week", "well", "were", "what", "whatever", "whatsoever", "when", "whence", "whenever", "whensoever", "where", "whereabouts", "whereafter", "whereas", "whereat", "whereby", "wherefore", "wherefrom", "wherein", "whereinto", "whereof", "whereon", "wheresoever", "whereto", "whereunto", "whereupon", "wherever", "wherewith", "whether", "whew", "which", "whichever", "whichsoever", "while", "whilst", "whither", "who", "whoa", "whoever", "whole", "whom", "whomever", "whomsoever", "whose", "whosoever", "why", "will", "wilt", "with", "within", "without", "worse", "worst", "would", "wow", "ye", "yet", "year", "yippee", "you", "your", "yours", "yourself", "yourselves" ]</td> | ||
122 | |||
123 | <td> </td> | ||
124 | <td class="context-item-desc"> | ||
125 | These are the default stopwords provided by Lemur. | ||
126 | |||
127 | </td> | ||
128 | |||
129 | </tr> | ||
130 | |||
131 | </table> | ||
132 | </div> | ||
133 | </div> | ||
134 | |||
135 | |||
136 | |||
137 | |||
138 | <!-- if method_list --> | ||
139 | |||
140 | |||
141 | |||
142 | |||
143 | </div> | ||
144 | |||
145 | <div id="validator-badges"> | ||
146 | <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | ||
147 | </div> | ||
148 | |||
149 | </body> | ||
150 | </html> | ||
151 |
doc/classes/Mirimiri/Document.html
File was created | 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | ||
3 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | ||
4 | <head> | ||
5 | <title>Class: Mirimiri::Document [RDoc Documentation]</title> | ||
6 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | ||
7 | <meta http-equiv="Content-Script-Type" content="text/javascript" /> | ||
8 | <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" /> | ||
9 | <script type="text/javascript"> | ||
10 | // <![CDATA[ | ||
11 | |||
12 | function popupCode( url ) { | ||
13 | window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | ||
14 | } | ||
15 | |||
16 | function toggleCode( id ) { | ||
17 | if ( document.getElementById ) | ||
18 | elem = document.getElementById( id ); | ||
19 | else if ( document.all ) | ||
20 | elem = eval( "document.all." + id ); | ||
21 | else | ||
22 | return false; | ||
23 | |||
24 | elemStyle = elem.style; | ||
25 | |||
26 | if ( elemStyle.display != "block" ) { | ||
27 | elemStyle.display = "block" | ||
28 | } else { | ||
29 | elemStyle.display = "none" | ||
30 | } | ||
31 | |||
32 | return true; | ||
33 | } | ||
34 | |||
35 | // Make codeblocks hidden by default | ||
36 | document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | ||
37 | |||
38 | // ]]> | ||
39 | </script> | ||
40 | |||
41 | </head> | ||
42 | <body> | ||
43 | |||
44 | |||
45 | <div id="classHeader"> | ||
46 | <table class="header-table"> | ||
47 | <tr class="top-aligned-row"> | ||
48 | <td><strong>Class</strong></td> | ||
49 | <td class="class-name-in-header">Mirimiri::Document</td> | ||
50 | </tr> | ||
51 | <tr class="top-aligned-row"> | ||
52 | <td><strong>In:</strong></td> | ||
53 | <td> | ||
54 | |||
55 | |||
56 | <a href="../../files/lib/mirimiri/document_rb.html"> | ||
57 | |||
58 | lib/mirimiri/document.rb | ||
59 | |||
60 | </a> | ||
61 | |||
62 | |||
63 | <br /> | ||
64 | |||
65 | </td> | ||
66 | </tr> | ||
67 | |||
68 | |||
69 | <tr class="top-aligned-row"> | ||
70 | <td><strong>Parent:</strong></td> | ||
71 | <td> | ||
72 | |||
73 | Object | ||
74 | |||
75 | </td> | ||
76 | </tr> | ||
77 | |||
78 | </table> | ||
79 | </div> | ||
80 | <!-- banner header --> | ||
81 | |||
82 | <div id="bodyContent"> | ||
83 | |||
84 | <div id="contextContent"> | ||
85 | |||
86 | <div id="description"> | ||
87 | <p> | ||
88 | A <a href="Document.html">Document</a> is a bag of words and is constructed | ||
89 | from a string. | ||
90 | </p> | ||
91 | |||
92 | </div> | ||
93 | |||
94 | </div> | ||
95 | |||
96 | |||
97 | <div id="method-list"> | ||
98 | <h3 class="section-bar">Methods</h3> | ||
99 | |||
100 | <div class="name-list"> | ||
101 | |||
102 | <a href="#M000024">count_words</a> | ||
103 | |||
104 | <a href="#M000025">entropy</a> | ||
105 | |||
106 | <a href="#M000022">format_words</a> | ||
107 | |||
108 | <a href="#M000027">new</a> | ||
109 | |||
110 | <a href="#M000023">ngrams</a> | ||
111 | |||
112 | <a href="#M000026">tf</a> | ||
113 | |||
114 | </div> | ||
115 | </div> | ||
116 | |||
117 | </div> | ||
118 | |||
119 | <!-- if includes --> | ||
120 | |||
121 | <div id="section"> | ||
122 | |||
123 | |||
124 | |||
125 | <div id="attribute-list"> | ||
126 | <h3 class="section-bar">Attributes</h3> | ||
127 | |||
128 | <div class="name-list"> | ||
129 | <table> | ||
130 | |||
131 | <tr class="top-aligned-row context-row"> | ||
132 | <td class="context-item-name">doc_content</td> | ||
133 | |||
134 | <td class="context-item-value"> [R] </td> | ||
135 | |||
136 | <td class="context-item-desc"></td> | ||
137 | </tr> | ||
138 | |||
139 | <tr class="top-aligned-row context-row"> | ||
140 | <td class="context-item-name">words</td> | ||
141 | |||
142 | <td class="context-item-value"> [R] </td> | ||
143 | |||
144 | <td class="context-item-desc"></td> | ||
145 | </tr> | ||
146 | |||
147 | </table> | ||
148 | </div> | ||
149 | </div> | ||
150 | |||
151 | |||
152 | <!-- if method_list --> | ||
153 | |||
154 | <div id="methods"> | ||
155 | |||
156 | <h3 class="section-bar">Public Class methods</h3> | ||
157 | |||
158 | |||
159 | <div id="method-M000027" class="method-detail"> | ||
160 | <a name="M000027"></a> | ||
161 | |||
162 | <div class="method-heading"> | ||
163 | |||
164 | <a href="Document.src/M000027.html" target="Code" class="method-signature" | ||
165 | onclick="popupCode('Document.src/M000027.html');return false;"> | ||
166 | |||
167 | <span class="method-name">new</span><span class="method-args">(content)</span> | ||
168 | |||
169 | </a> | ||
170 | |||
171 | </div> | ||
172 | |||
173 | <div class="method-description"> | ||
174 | |||
175 | </div> | ||
176 | </div> | ||
177 | |||
178 | |||
179 | <h3 class="section-bar">Public Instance methods</h3> | ||
180 | |||
181 | |||
182 | <div id="method-M000024" class="method-detail"> | ||
183 | <a name="M000024"></a> | ||
184 | |||
185 | <div class="method-heading"> | ||
186 | |||
187 | <a href="Document.src/M000024.html" target="Code" class="method-signature" | ||
188 | onclick="popupCode('Document.src/M000024.html');return false;"> | ||
189 | |||
190 | <span class="method-name">count_words</span><span class="method-args">()</span> | ||
191 | |||
192 | </a> | ||
193 | |||
194 | </div> | ||
195 | |||
196 | <div class="method-description"> | ||
197 | |||
198 | <p> | ||
199 | Returns a Hash containing the words and their associated counts in the | ||
200 | current <a href="Document.html">Document</a>. | ||
201 | </p> | ||
202 | <pre> | ||
203 | count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... } | ||
204 | </pre> | ||
205 | |||
206 | </div> | ||
207 | </div> | ||
208 | |||
209 | |||
210 | <div id="method-M000025" class="method-detail"> | ||
211 | <a name="M000025"></a> | ||
212 | |||
213 | <div class="method-heading"> | ||
214 | |||
215 | <a href="Document.src/M000025.html" target="Code" class="method-signature" | ||
216 | onclick="popupCode('Document.src/M000025.html');return false;"> | ||
217 | |||
218 | <span class="method-name">entropy</span><span class="method-args">(s)</span> | ||
219 | |||
220 | </a> | ||
221 | |||
222 | </div> | ||
223 | |||
224 | <div class="method-description"> | ||
225 | |||
226 | <p> | ||
227 | Computes the entropy of a given string <tt>s</tt> inside the document. | ||
228 | </p> | ||
229 | <p> | ||
230 | If the string parameter is composed of several words (i.e. tokens separated by | ||
231 | whitespace), it is treated as an n-gram. | ||
232 | </p> | ||
233 | <pre> | ||
234 | entropy("guitar") #=> 0.00432114812727959 | ||
235 | entropy("dillinger escape plan") #=> 0.265862076325102 | ||
236 | </pre> | ||
237 | |||
238 | </div> | ||
239 | </div> | ||
240 | |||
241 | |||
242 | <div id="method-M000023" class="method-detail"> | ||
243 | <a name="M000023"></a> | ||
244 | |||
245 | <div class="method-heading"> | ||
246 | |||
247 | <a href="Document.src/M000023.html" target="Code" class="method-signature" | ||
248 | onclick="popupCode('Document.src/M000023.html');return false;"> | ||
249 | |||
250 | <span class="method-name">ngrams</span><span class="method-args">(n)</span> | ||
251 | |||
252 | </a> | ||
253 | |||
254 | </div> | ||
255 | |||
256 | <div class="method-description"> | ||
257 | |||
258 | <p> | ||
259 | Returns an Array containing the <tt>n</tt>-grams (words) from the current | ||
260 | <a href="Document.html">Document</a>. | ||
261 | </p> | ||
262 | <pre> | ||
263 | ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...] | ||
264 | </pre> | ||
265 | |||
266 | </div> | ||
267 | </div> | ||
268 | |||
269 | |||
270 | <div id="method-M000026" class="method-detail"> | ||
271 | <a name="M000026"></a> | ||
272 | |||
273 | <div class="method-heading"> | ||
274 | |||
275 | <a href="Document.src/M000026.html" target="Code" class="method-signature" | ||
276 | onclick="popupCode('Document.src/M000026.html');return false;"> | ||
277 | |||
278 | <span class="method-name">tf</span><span class="method-args">(s)</span> | ||
279 | |||
280 | </a> | ||
281 | |||
282 | </div> | ||
283 | |||
284 | <div class="method-description"> | ||
285 | |||
286 | <p> | ||
287 | Computes the term frequency of a given <b>word</b> <tt>s</tt>. | ||
288 | </p> | ||
289 | <pre> | ||
290 | tf("guitar") #=> 0.000380372765310004 | ||
291 | </pre> | ||
292 | |||
293 | </div> | ||
294 | </div> | ||
295 | |||
296 | |||
297 | <h3 class="section-bar">Protected Instance methods</h3> | ||
298 | |||
299 | |||
300 | <div id="method-M000022" class="method-detail"> | ||
301 | <a name="M000022"></a> | ||
302 | |||
303 | <div class="method-heading"> | ||
304 | |||
305 | <a href="Document.src/M000022.html" target="Code" class="method-signature" | ||
306 | onclick="popupCode('Document.src/M000022.html');return false;"> | ||
307 | |||
308 | <span class="method-name">format_words</span><span class="method-args">()</span> | ||
309 | |||
310 | </a> | ||
311 | |||
312 | </div> | ||
313 | |||
314 | <div class="method-description"> | ||
315 | |||
316 | <p> | ||
317 | Any non-word characters are removed from the words (see <a | ||
318 | href="http://perldoc.perl.org/perlre.html">perldoc.perl.org/perlre.html</a> | ||
319 | and the \W special escape). | ||
320 | </p> | ||
321 | <p> | ||
322 | Protected function, only meant to be called at initialization. | ||
323 | </p> | ||
324 | |||
325 | </div> | ||
326 | </div> | ||
327 | |||
328 | |||
329 | |||
330 | </div> | ||
331 | |||
332 | |||
333 | |||
334 | |||
335 | </div> | ||
336 | |||
337 | <div id="validator-badges"> | ||
338 | <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | ||
339 | </div> | ||
340 | |||
341 | </body> | ||
342 | </html> | ||
343 |
doc/classes/Mirimiri/Document.src/M000022.html
File was created | 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | ||
3 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | ||
4 | <head> | ||
5 | <title>format_words (Mirimiri::Document)</title> | ||
6 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | ||
7 | <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | ||
8 | </head> | ||
9 | <body class="standalone-code"> | ||
10 | <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 34</span> | ||
11 | <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">format_words</span> | ||
12 | <span class="ruby-identifier">wo</span> = [] | ||
13 | |||
14 | <span class="ruby-ivar">@doc_content</span>.<span class="ruby-identifier">split</span>.<span class="ruby-identifier">each</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">w</span><span class="ruby-operator">|</span> | ||
15 | <span class="ruby-identifier">w</span>.<span class="ruby-identifier">split</span>(<span class="ruby-regexp re">/\W/</span>).<span class="ruby-identifier">each</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">sw</span><span class="ruby-operator">|</span> | ||
16 | <span class="ruby-identifier">wo</span>.<span class="ruby-identifier">push</span>(<span class="ruby-identifier">sw</span>.<span class="ruby-identifier">downcase</span>) <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">sw</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/[a-zA-Z]/</span> | ||
17 | <span class="ruby-keyword kw">end</span> | ||
18 | <span class="ruby-keyword kw">end</span> | ||
19 | |||
20 | <span class="ruby-identifier">wo</span> | ||
21 | <span class="ruby-keyword kw">end</span></pre> | ||
22 | </body> | ||
23 | </html> | ||
24 |
doc/classes/Mirimiri/Document.src/M000023.html
File was created | 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | ||
3 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | ||
4 | <head> | ||
5 | <title>ngrams (Mirimiri::Document)</title> | ||
6 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | ||
7 | <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | ||
8 | </head> | ||
9 | <body class="standalone-code"> | ||
10 | <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 49</span> | ||
11 | <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">ngrams</span>(<span class="ruby-identifier">n</span>) | ||
12 | <span class="ruby-identifier">window</span> = [] | ||
13 | <span class="ruby-identifier">ngrams_array</span> = [] | ||
14 | |||
15 | <span class="ruby-ivar">@words</span>.<span class="ruby-identifier">each</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">w</span><span class="ruby-operator">|</span> | ||
16 | <span class="ruby-identifier">window</span>.<span class="ruby-identifier">push</span>(<span class="ruby-identifier">w</span>) | ||
17 | <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">window</span>.<span class="ruby-identifier">size</span> <span class="ruby-operator">==</span> <span class="ruby-identifier">n</span> | ||
18 | <span class="ruby-identifier">ngrams_array</span>.<span class="ruby-identifier">push</span> <span class="ruby-identifier">window</span>.<span class="ruby-identifier">join</span>(<span class="ruby-value str">" "</span>) | ||
19 | <span class="ruby-identifier">window</span>.<span class="ruby-identifier">delete_at</span>(<span class="ruby-value">0</span>) | ||
20 | <span class="ruby-keyword kw">end</span> | ||
21 | <span class="ruby-keyword kw">end</span> | ||
22 | |||
23 | <span class="ruby-identifier">ngrams_array</span>.<span class="ruby-identifier">uniq</span> | ||
24 | <span class="ruby-keyword kw">end</span></pre> | ||
25 | </body> | ||
26 | </html> | ||
27 |
doc/classes/Mirimiri/Document.src/M000024.html
File was created | 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | ||
3 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | ||
4 | <head> | ||
5 | <title>count_words (Mirimiri::Document)</title> | ||
6 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | ||
7 | <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | ||
8 | </head> | ||
9 | <body class="standalone-code"> | ||
10 | <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 67</span> | ||
11 | <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">count_words</span> | ||
12 | <span class="ruby-identifier">counts</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">h</span>,<span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">h</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-value">0</span> } | ||
13 | <span class="ruby-ivar">@words</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">w</span><span class="ruby-operator">|</span> <span class="ruby-identifier">counts</span>[<span class="ruby-identifier">w</span>] <span class="ruby-operator">+=</span> <span class="ruby-value">1</span> } | ||
14 | |||
15 | <span class="ruby-identifier">counts</span> | ||
16 | <span class="ruby-keyword kw">end</span></pre> | ||
17 | </body> | ||
18 | </html> | ||
19 |
doc/classes/Mirimiri/Document.src/M000025.html
File was created | 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | ||
3 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | ||
4 | <head> | ||
5 | <title>entropy (Mirimiri::Document)</title> | ||
6 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | ||
7 | <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | ||
8 | </head> | ||
9 | <body class="standalone-code"> | ||
10 | <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 81</span> | ||
11 | <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">entropy</span>(<span class="ruby-identifier">s</span>) | ||
12 | <span class="ruby-identifier">en</span> = <span class="ruby-value">0</span><span class="ruby-value">.0</span> | ||
13 | <span class="ruby-identifier">counts</span> = <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">count_words</span> | ||
14 | |||
15 | <span class="ruby-identifier">s</span>.<span class="ruby-identifier">split</span>.<span class="ruby-identifier">each</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">w</span><span class="ruby-operator">|</span> | ||
16 | <span class="ruby-identifier">p_wi</span> = <span class="ruby-identifier">counts</span>[<span class="ruby-identifier">w</span>].<span class="ruby-identifier">to_f</span><span class="ruby-operator">/</span><span class="ruby-ivar">@words</span>.<span class="ruby-identifier">count</span>.<span class="ruby-identifier">to_f</span> | ||
17 | <span class="ruby-identifier">en</span> <span class="ruby-operator">+=</span> <span class="ruby-identifier">p_wi</span><span class="ruby-operator">*</span><span class="ruby-constant">Math</span>.<span class="ruby-identifier">log2</span>(<span class="ruby-identifier">p_wi</span>) | ||
18 | <span class="ruby-keyword kw">end</span> | ||
19 | |||
20 | <span class="ruby-identifier">en</span> <span class="ruby-operator">*=</span> <span class="ruby-value">-1</span> | ||
21 | <span class="ruby-identifier">en</span> | ||
22 | <span class="ruby-keyword kw">end</span></pre> | ||
23 | </body> | ||
24 | </html> | ||
25 |
doc/classes/Mirimiri/Document.src/M000026.html
File was created | 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | ||
3 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | ||
4 | <head> | ||
5 | <title>tf (Mirimiri::Document)</title> | ||
6 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | ||
7 | <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | ||
8 | </head> | ||
9 | <body class="standalone-code"> | ||
10 | <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 97</span> | ||
11 | <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">tf</span>(<span class="ruby-identifier">s</span>) | ||
12 | <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">count_words</span>[<span class="ruby-identifier">s</span>].<span class="ruby-identifier">to_f</span><span class="ruby-operator">/</span><span class="ruby-ivar">@words</span>.<span class="ruby-identifier">size</span>.<span class="ruby-identifier">to_f</span> | ||
13 | <span class="ruby-keyword kw">end</span></pre> | ||
14 | </body> | ||
15 | </html> | ||
16 |
doc/classes/Mirimiri/Document.src/M000027.html
File was created | 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | ||
3 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | ||
4 | <head> | ||
5 | <title>new (Mirimiri::Document)</title> | ||
6 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | ||
7 | <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | ||
8 | </head> | ||
9 | <body class="standalone-code"> | ||
10 | <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 102</span> | ||
11 | <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">content</span>) | ||
12 | <span class="ruby-ivar">@doc_content</span> = <span class="ruby-identifier">content</span> | ||
13 | <span class="ruby-ivar">@words</span> = <span class="ruby-identifier">format_words</span> | ||
14 | <span class="ruby-keyword kw">end</span></pre> | ||
15 | </body> | ||
16 | </html> | ||
17 |
doc/classes/Mirimiri/WebDocument.html
File was created | 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | ||
3 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | ||
4 | <head> | ||
5 | <title>Class: Mirimiri::WebDocument [RDoc Documentation]</title> | ||
6 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | ||
7 | <meta http-equiv="Content-Script-Type" content="text/javascript" /> | ||
8 | <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" /> | ||
9 | <script type="text/javascript"> | ||
10 | // <![CDATA[ | ||
11 | |||
12 | function popupCode( url ) { | ||
13 | window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | ||
14 | } | ||
15 | |||
16 | function toggleCode( id ) { | ||
17 | if ( document.getElementById ) | ||
18 | elem = document.getElementById( id ); | ||
19 | else if ( document.all ) | ||
20 | elem = eval( "document.all." + id ); | ||
21 | else | ||
22 | return false; | ||
23 | |||
24 | elemStyle = elem.style; | ||
25 | |||
26 | if ( elemStyle.display != "block" ) { | ||
27 | elemStyle.display = "block" | ||
28 | } else { | ||
29 | elemStyle.display = "none" | ||
30 | } | ||
31 | |||
32 | return true; | ||
33 | } | ||
34 | |||
35 | // Make codeblocks hidden by default | ||
36 | document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | ||
37 | |||
38 | // ]]> | ||
39 | </script> | ||
40 | |||
41 | </head> | ||
42 | <body> | ||
43 | |||
44 | |||
45 | <div id="classHeader"> | ||
46 | <table class="header-table"> | ||
47 | <tr class="top-aligned-row"> | ||
48 | <td><strong>Class</strong></td> | ||
49 | <td class="class-name-in-header">Mirimiri::WebDocument</td> | ||
50 | </tr> | ||
51 | <tr class="top-aligned-row"> | ||
52 | <td><strong>In:</strong></td> | ||
53 | <td> | ||
54 | |||
55 | |||
56 | <a href="../../files/lib/mirimiri/document_rb.html"> | ||
57 | |||
58 | lib/mirimiri/document.rb | ||
59 | |||
60 | </a> | ||
61 | |||
62 | |||
63 | <br /> | ||
64 | |||
65 | </td> | ||
66 | </tr> | ||
67 | |||
68 | |||
69 | <tr class="top-aligned-row"> | ||
70 | <td><strong>Parent:</strong></td> | ||
71 | <td> | ||
72 | |||
73 | <a href="Document.html"> | ||
74 | |||
75 | Mirimiri::Document | ||
76 | |||
77 | </a> | ||
78 | |||
79 | </td> | ||
80 | </tr> | ||
81 | |||
82 | </table> | ||
83 | </div> | ||
84 | <!-- banner header --> | ||
85 | |||
86 | <div id="bodyContent"> | ||
87 | |||
88 | <div id="contextContent"> | ||
89 | |||
90 | <div id="description"> | ||
91 | <p> | ||
92 | A <a href="WebDocument.html">WebDocument</a> is a <a | ||
93 | href="Document.html">Document</a> with a <tt>url</tt>. | ||
94 | </p> | ||
95 | |||
96 | </div> | ||
97 | |||
98 | </div> | ||
99 | |||
100 | |||
101 | <div id="method-list"> | ||
102 | <h3 class="section-bar">Methods</h3> | ||
103 | |||
104 | <div class="name-list"> | ||
105 | |||
106 | <a href="#M000028">get_content</a> | ||
107 | |||
108 | <a href="#M000029">new</a> | ||
109 | |||
110 | </div> | ||
111 | </div> | ||
112 | |||
113 | </div> | ||
114 | |||
115 | <!-- if includes --> | ||
116 | |||
117 | <div id="section"> | ||
118 | |||
119 | |||
120 | |||
121 | <div id="attribute-list"> | ||
122 | <h3 class="section-bar">Attributes</h3> | ||
123 | |||
124 | <div class="name-list"> | ||
125 | <table> | ||
126 | |||
127 | <tr class="top-aligned-row context-row"> | ||
128 | <td class="context-item-name">url</td> | ||
129 | |||
130 | <td class="context-item-value"> [R] </td> | ||
131 | |||
132 | <td class="context-item-desc"></td> | ||
133 | </tr> | ||
134 | |||
135 | </table> | ||
136 | </div> | ||
137 | </div> | ||
138 | |||
139 | |||
140 | <!-- if method_list --> | ||
141 | |||
142 | <div id="methods"> | ||
143 | |||
144 | <h3 class="section-bar">Public Class methods</h3> | ||
145 | |||
146 | |||
147 | <div id="method-M000028" class="method-detail"> | ||
148 | <a name="M000028"></a> | ||
149 | |||
150 | <div class="method-heading"> | ||
151 | |||
152 | <a href="WebDocument.src/M000028.html" target="Code" class="method-signature" | ||
153 | onclick="popupCode('WebDocument.src/M000028.html');return false;"> | ||
154 | |||
155 | <span class="method-name">get_content</span><span class="method-args">(url)</span> | ||
156 | |||
157 | </a> | ||
158 | |||
159 | </div> | ||
160 | |||
161 | <div class="method-description"> | ||
162 | |||
163 | <p> | ||
164 | Returns the HTML text from the page of a given <tt>url</tt>. | ||
165 | </p> | ||
166 | |||
167 | </div> | ||
168 | </div> | ||
169 | |||
170 | |||
171 | <div id="method-M000029" class="method-detail"> | ||
172 | <a name="M000029"></a> | ||
173 | |||
174 | <div class="method-heading"> | ||
175 | |||
176 | <a href="WebDocument.src/M000029.html" target="Code" class="method-signature" | ||
177 | onclick="popupCode('WebDocument.src/M000029.html');return false;"> | ||
178 | |||
179 | <span class="method-name">new</span><span class="method-args">(url)</span> | ||
180 | |||
181 | </a> | ||
182 | |||
183 | </div> | ||
184 | |||
185 | <div class="method-description"> | ||
186 | |||
187 | <p> | ||
188 | <a href="WebDocument.html">WebDocument</a> constructor, the content of the | ||
189 | <a href="Document.html">Document</a> is the HTML page without the tags. | ||
190 | </p> | ||
191 | |||
192 | </div> | ||
193 | </div> | ||
194 | |||
195 | |||
196 | |||
197 | </div> | ||
198 | |||
199 | |||
200 | |||
201 | |||
202 | </div> | ||
203 | |||
204 | <div id="validator-badges"> | ||
205 | <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | ||
206 | </div> | ||
207 | |||
208 | </body> | ||
209 | </html> | ||
210 |
doc/classes/Mirimiri/WebDocument.src/M000028.html
File was created | 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | ||
3 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | ||
4 | <head> | ||
5 | <title>get_content (Mirimiri::WebDocument)</title> | ||
6 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | ||
7 | <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | ||
8 | </head> | ||
9 | <body class="standalone-code"> | ||
10 | <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 115</span> | ||
11 | <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">get_content</span>(<span class="ruby-identifier">url</span>) | ||
12 | <span class="ruby-identifier">require</span> <span class="ruby-value str">'net/http'</span> | ||
13 | <span class="ruby-constant">Net</span><span class="ruby-operator">::</span><span class="ruby-constant">HTTP</span>.<span class="ruby-identifier">get</span>(<span class="ruby-constant">URI</span>.<span class="ruby-identifier">parse</span>(<span class="ruby-identifier">url</span>)) | ||
14 | <span class="ruby-keyword kw">end</span></pre> | ||
15 | </body> | ||
16 | </html> | ||
17 |
doc/classes/Mirimiri/WebDocument.src/M000029.html
File was created | 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | ||
3 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | ||
4 | <head> | ||
5 | <title>new (Mirimiri::WebDocument)</title> | ||
6 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | ||
7 | <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | ||
8 | </head> | ||
9 | <body class="standalone-code"> | ||
10 | <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 122</span> | ||
11 | <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">url</span>) | ||
12 | <span class="ruby-ivar">@url</span> = <span class="ruby-identifier">url</span> | ||
13 | <span class="ruby-keyword kw">super</span> <span class="ruby-constant">WebDocument</span>.<span class="ruby-identifier">get_content</span>(<span class="ruby-identifier">url</span>).<span class="ruby-identifier">strip_javascripts</span>.<span class="ruby-identifier">strip_stylesheets</span>.<span class="ruby-identifier">strip_xml_tags</span> | ||
14 | <span class="ruby-keyword kw">end</span></pre> | ||
15 | </body> | ||
16 | </html> | ||
17 |
doc/classes/Mirimiri/WikipediaPage.html
File was created | 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | ||
3 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | ||
4 | <head> | ||
5 | <title>Class: Mirimiri::WikipediaPage [RDoc Documentation]</title> | ||
6 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | ||
7 | <meta http-equiv="Content-Script-Type" content="text/javascript" /> | ||
8 | <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" /> | ||
9 | <script type="text/javascript"> | ||
10 | // <![CDATA[ | ||
11 | |||
12 | function popupCode( url ) { | ||
13 | window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | ||
14 | } | ||
15 | |||
16 | function toggleCode( id ) { | ||
17 | if ( document.getElementById ) | ||
18 | elem = document.getElementById( id ); | ||
19 | else if ( document.all ) | ||
20 | elem = eval( "document.all." + id ); | ||
21 | else | ||
22 | return false; | ||
23 | |||
24 | elemStyle = elem.style; | ||
25 | |||
26 | if ( elemStyle.display != "block" ) { | ||
27 | elemStyle.display = "block" | ||
28 | } else { | ||
29 | elemStyle.display = "none" | ||
30 | } | ||
31 | |||
32 | return true; | ||
33 | } | ||
34 | |||
35 | // Make codeblocks hidden by default | ||
36 | document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | ||
37 | |||
38 | // ]]> | ||
39 | </script> | ||
40 | |||
41 | </head> | ||
42 | <body> | ||
43 | |||
44 | |||
45 | <div id="classHeader"> | ||
46 | <table class="header-table"> | ||
47 | <tr class="top-aligned-row"> | ||
48 | <td><strong>Class</strong></td> | ||
49 | <td class="class-name-in-header">Mirimiri::WikipediaPage</td> | ||
50 | </tr> | ||
51 | <tr class="top-aligned-row"> | ||
52 | <td><strong>In:</strong></td> | ||
53 | <td> | ||
54 | |||
55 | |||
56 | <a href="../../files/lib/mirimiri/document_rb.html"> | ||
57 | |||
58 | lib/mirimiri/document.rb | ||
59 | |||
60 | </a> | ||
61 | |||
62 | |||
63 | <br /> | ||
64 | |||
65 | </td> | ||
66 | </tr> | ||
67 | |||
68 | |||
69 | <tr class="top-aligned-row"> | ||
70 | <td><strong>Parent:</strong></td> | ||
71 | <td> | ||
72 | |||
73 | <a href="WebDocument.html"> | ||
74 | |||
75 | Mirimiri::WebDocument | ||
76 | |||
77 | </a> | ||
78 | |||
79 | </td> | ||
80 | </tr> | ||
81 | |||
82 | </table> | ||
83 | </div> | ||
84 | <!-- banner header --> | ||
85 | |||
86 | <div id="bodyContent"> | ||
87 | |||
88 | <div id="contextContent"> | ||
89 | |||
90 | <div id="description"> | ||
91 | <p> | ||
92 | A <a href="WikipediaPage.html">WikipediaPage</a> is a <a | ||
93 | href="WebDocument.html">WebDocument</a>. | ||
94 | </p> | ||
95 | |||
96 | </div> | ||
97 | |||
98 | </div> | ||
99 | |||
100 | |||
101 | <div id="method-list"> | ||
102 | <h3 class="section-bar">Methods</h3> | ||
103 | |||
104 | <div class="name-list"> | ||
105 | |||
106 | <a href="#M000031">get_url</a> | ||
107 | |||
108 | <a href="#M000032">search_homepage</a> | ||
109 | |||
110 | <a href="#M000030">search_wikipedia_titles</a> | ||
111 | |||
112 | </div> | ||
113 | </div> | ||
114 | |||
115 | </div> | ||
116 | |||
117 | <!-- if includes --> | ||
118 | |||
119 | <div id="section"> | ||
120 | |||
121 | |||
122 | |||
123 | |||
124 | <!-- if method_list --> | ||
125 | |||
126 | <div id="methods"> | ||
127 | |||
128 | <h3 class="section-bar">Public Class methods</h3> | ||
129 | |||
130 | |||
131 | <div id="method-M000031" class="method-detail"> | ||
132 | <a name="M000031"></a> | ||
133 | |||
134 | <div class="method-heading"> | ||
135 | |||
136 | <a href="WikipediaPage.src/M000031.html" target="Code" class="method-signature" | ||
137 | onclick="popupCode('WikipediaPage.src/M000031.html');return false;"> | ||
138 | |||
139 | <span class="method-name">get_url</span><span class="method-args">(name)</span> | ||
140 | |||
141 | </a> | ||
142 | |||
143 | </div> | ||
144 | |||
145 | <div class="method-description"> | ||
146 | |||
147 | </div> | ||
148 | </div> | ||
149 | |||
150 | |||
151 | <div id="method-M000032" class="method-detail"> | ||
152 | <a name="M000032"></a> | ||
153 | |||
154 | <div class="method-heading"> | ||
155 | |||
156 | <a href="WikipediaPage.src/M000032.html" target="Code" class="method-signature" | ||
157 | onclick="popupCode('WikipediaPage.src/M000032.html');return false;"> | ||
158 | |||
159 | <span class="method-name">search_homepage</span><span class="method-args">(name)</span> | ||
160 | |||
161 | </a> | ||
162 | |||
163 | </div> | ||
164 | |||
165 | <div class="method-description"> | ||
166 | |||
167 | </div> | ||
168 | </div> | ||
169 | |||
170 | |||
171 | <div id="method-M000030" class="method-detail"> | ||
172 | <a name="M000030"></a> | ||
173 | |||
174 | <div class="method-heading"> | ||
175 | |||
176 | <a href="WikipediaPage.src/M000030.html" target="Code" class="method-signature" | ||
177 | onclick="popupCode('WikipediaPage.src/M000030.html');return false;"> | ||
178 | |||
179 | <span class="method-name">search_wikipedia_titles</span><span class="method-args">(name)</span> | ||
180 | |||
181 | </a> | ||
182 | |||
183 | </div> | ||
184 | |||
185 | <div class="method-description"> | ||
186 | |||
187 | </div> | ||
188 | </div> | ||
189 | |||
190 | |||
191 | |||
192 | </div> | ||
193 | |||
194 | |||
195 | |||
196 | |||
197 | </div> | ||
198 | |||
199 | <div id="validator-badges"> | ||
200 | <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | ||
201 | </div> | ||
202 | |||
203 | </body> | ||
204 | </html> | ||
205 |
doc/classes/Mirimiri/WikipediaPage.src/M000030.html
File was created | 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | ||
3 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | ||
4 | <head> | ||
5 | <title>search_wikipedia_titles (Mirimiri::WikipediaPage)</title> | ||
6 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | ||
7 | <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | ||
8 | </head> | ||
9 | <body class="standalone-code"> | ||
10 | <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 135</span> | ||
11 | <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">search_wikipedia_titles</span>(<span class="ruby-identifier">name</span>) | ||
12 | <span class="ruby-identifier">raise</span> <span class="ruby-constant">ArgumentError</span>, <span class="ruby-value str">"Bad encoding"</span>, <span class="ruby-identifier">name</span> <span class="ruby-keyword kw">unless</span> <span class="ruby-identifier">name</span>.<span class="ruby-identifier">isutf8</span> | ||
13 | |||
14 | <span class="ruby-identifier">res</span> = <span class="ruby-constant">REXML</span><span class="ruby-operator">::</span><span class="ruby-constant">Document</span>.<span class="ruby-identifier">new</span>(<span class="ruby-constant">Net</span><span class="ruby-operator">::</span><span class="ruby-constant">HTTP</span>.<span class="ruby-identifier">get</span>( <span class="ruby-constant">URI</span>.<span class="ruby-identifier">parse</span> <span class="ruby-node">"http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml"</span> ).<span class="ruby-identifier">toutf8</span>).<span class="ruby-identifier">elements</span>[<span class="ruby-value str">'api/query/search'</span>] | ||
15 | |||
16 | <span class="ruby-identifier">res</span>.<span class="ruby-identifier">collect</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">e</span><span class="ruby-operator">|</span> <span class="ruby-identifier">e</span>.<span class="ruby-identifier">attributes</span>[<span class="ruby-value str">'title'</span>] } <span class="ruby-keyword kw">unless</span> <span class="ruby-identifier">res</span>.<span class="ruby-identifier">nil?</span> | ||
17 | <span class="ruby-keyword kw">end</span></pre> | ||
18 | </body> | ||
19 | </html> | ||
20 |
doc/classes/Mirimiri/WikipediaPage.src/M000031.html
File was created | 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | ||
3 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | ||
4 | <head> | ||
5 | <title>get_url (Mirimiri::WikipediaPage)</title> | ||
6 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | ||
7 | <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | ||
8 | </head> | ||
9 | <body class="standalone-code"> | ||
10 | <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 143</span> | ||
11 | <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">get_url</span>(<span class="ruby-identifier">name</span>) | ||
12 | <span class="ruby-identifier">raise</span> <span class="ruby-constant">ArgumentError</span>, <span class="ruby-value str">"Bad encoding"</span>, <span class="ruby-identifier">name</span> <span class="ruby-keyword kw">unless</span> <span class="ruby-identifier">name</span>.<span class="ruby-identifier">isutf8</span> | ||
13 | |||
14 | <span class="ruby-identifier">atts</span> = <span class="ruby-constant">REXML</span><span class="ruby-operator">::</span><span class="ruby-constant">Document</span>.<span class="ruby-identifier">new</span>(<span class="ruby-constant">Net</span><span class="ruby-operator">::</span><span class="ruby-constant">HTTP</span>.<span class="ruby-identifier">get</span>( <span class="ruby-constant">URI</span>.<span class="ruby-identifier">parse</span> <span class="ruby-node">"http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml"</span> ).<span class="ruby-identifier">toutf8</span>).<span class="ruby-identifier">elements</span>[<span class="ruby-value str">'api/query/pages/page'</span>].<span class="ruby-identifier">attributes</span> | ||
15 | |||
16 | <span class="ruby-identifier">atts</span>[<span class="ruby-value str">'fullurl'</span>] <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">atts</span>[<span class="ruby-value str">'missing'</span>].<span class="ruby-identifier">nil?</span> | ||
17 | <span class="ruby-keyword kw">end</span></pre> | ||
18 | </body> | ||
19 | </html> | ||
20 |
doc/classes/Mirimiri/WikipediaPage.src/M000032.html
File was created | 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | ||
3 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | ||
4 | <head> | ||
5 | <title>search_homepage (Mirimiri::WikipediaPage)</title> | ||
6 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | ||
7 | <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | ||
8 | </head> | ||
9 | <body class="standalone-code"> | ||
10 | <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 151</span> | ||
11 | <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">search_homepage</span>(<span class="ruby-identifier">name</span>) | ||
12 | <span class="ruby-identifier">title</span> = <span class="ruby-constant">WikipediaPage</span>.<span class="ruby-identifier">search_wikipedia_titles</span> <span class="ruby-identifier">name</span> | ||
13 | |||
14 | <span class="ruby-constant">WikipediaPage</span>.<span class="ruby-identifier">new</span>(<span class="ruby-constant">WikipediaPage</span>.<span class="ruby-identifier">get_url</span> <span class="ruby-identifier">title</span>[<span class="ruby-value">0</span>]) <span class="ruby-keyword kw">unless</span> <span class="ruby-identifier">title</span>.<span class="ruby-identifier">nil?</span> <span class="ruby-operator">||</span> <span class="ruby-identifier">title</span>.<span class="ruby-identifier">empty?</span> | ||
15 | <span class="ruby-keyword kw">end</span></pre> | ||
16 | </body> | ||
17 | </html> | ||
18 |
doc/files/lib/mirimiri/corpus_rb.html
File was created | 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | ||
3 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | ||
4 | <head> | ||
5 | <title>File: corpus.rb [RDoc Documentation]</title> | ||
6 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | ||
7 | <meta http-equiv="Content-Script-Type" content="text/javascript" /> | ||
8 | <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | ||
9 | <script type="text/javascript"> | ||
10 | // <![CDATA[ | ||
11 | |||
12 | function popupCode( url ) { | ||
13 | window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | ||
14 | } | ||
15 | |||
16 | function toggleCode( id ) { | ||
17 | if ( document.getElementById ) | ||
18 | elem = document.getElementById( id ); | ||
19 | else if ( document.all ) | ||
20 | elem = eval( "document.all." + id ); | ||
21 | else | ||
22 | return false; | ||
23 | |||
24 | elemStyle = elem.style; | ||
25 | |||
26 | if ( elemStyle.display != "block" ) { | ||
27 | elemStyle.display = "block" | ||
28 | } else { | ||
29 | elemStyle.display = "none" | ||
30 | } | ||
31 | |||
32 | return true; | ||
33 | } | ||
34 | |||
35 | // Make codeblocks hidden by default | ||
36 | document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | ||
37 | |||
38 | // ]]> | ||
39 | </script> | ||
40 | |||
41 | </head> | ||
42 | <body> | ||
43 | |||
44 | |||
45 | <div id="fileHeader"> | ||
46 | <h1>corpus.rb</h1> | ||
47 | <table class="header-table"> | ||
48 | <tr class="top-aligned-row"> | ||
49 | <td><strong>Path:</strong></td> | ||
50 | <td>lib/mirimiri/corpus.rb | ||
51 | |||
52 | </td> | ||
53 | </tr> | ||
54 | <tr class="top-aligned-row"> | ||
55 | <td><strong>Last Update:</strong></td> | ||
56 | <td>2010-12-20 10:35:26 +0100</td> | ||
57 | </tr> | ||
58 | </table> | ||
59 | </div> | ||
60 | <!-- banner header --> | ||
61 | |||
62 | <div id="bodyContent"> | ||
63 | |||
64 | <div id="contextContent"> | ||
65 | |||
66 | </div> | ||
67 | |||
68 | |||
69 | </div> | ||
70 | |||
71 | <!-- if includes --> | ||
72 | |||
73 | <div id="section"> | ||
74 | |||
75 | |||
76 | |||
77 | |||
78 | <!-- if method_list --> | ||
79 | |||
80 | |||
81 | |||
82 | |||
83 | </div> | ||
84 | |||
85 | <div id="validator-badges"> | ||
86 | <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | ||
87 | </div> | ||
88 | |||
89 | </body> | ||
90 | </html> | ||
91 |
doc/files/lib/mirimiri/document_rb.html
File was created | 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | ||
3 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | ||
4 | <head> | ||
5 | <title>File: document.rb [RDoc Documentation]</title> | ||
6 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | ||
7 | <meta http-equiv="Content-Script-Type" content="text/javascript" /> | ||
8 | <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | ||
9 | <script type="text/javascript"> | ||
10 | // <![CDATA[ | ||
11 | |||
12 | function popupCode( url ) { | ||
13 | window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | ||
14 | } | ||
15 | |||
16 | function toggleCode( id ) { | ||
17 | if ( document.getElementById ) | ||
18 | elem = document.getElementById( id ); | ||
19 | else if ( document.all ) | ||
20 | elem = eval( "document.all." + id ); | ||
21 | else | ||
22 | return false; | ||
23 | |||
24 | elemStyle = elem.style; | ||
25 | |||
26 | if ( elemStyle.display != "block" ) { | ||
27 | elemStyle.display = "block" | ||
28 | } else { | ||
29 | elemStyle.display = "none" | ||
30 | } | ||
31 | |||
32 | return true; | ||
33 | } | ||
34 | |||
35 | // Make codeblocks hidden by default | ||
36 | document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | ||
37 | |||
38 | // ]]> | ||
39 | </script> | ||
40 | |||
41 | </head> | ||
42 | <body> | ||
43 | |||
44 | |||
45 | <div id="fileHeader"> | ||
46 | <h1>document.rb</h1> | ||
47 | <table class="header-table"> | ||
48 | <tr class="top-aligned-row"> | ||
49 | <td><strong>Path:</strong></td> | ||
50 | <td>lib/mirimiri/document.rb | ||
51 | |||
52 | </td> | ||
53 | </tr> | ||
54 | <tr class="top-aligned-row"> | ||
55 | <td><strong>Last Update:</strong></td> | ||
56 | <td>2010-12-20 10:36:07 +0100</td> | ||
57 | </tr> | ||
58 | </table> | ||
59 | </div> | ||
60 | <!-- banner header --> | ||
61 | |||
62 | <div id="bodyContent"> | ||
63 | |||
64 | <div id="contextContent"> | ||
65 | |||
66 | <div id="requires-list"> | ||
67 | <h3 class="section-bar">Required files</h3> | ||
68 | |||
69 | <div class="name-list"> | ||
70 | |||
71 | net/http | ||
72 | |||
73 | rexml/document | ||
74 | |||
75 | net/http | ||
76 | |||
77 | kconv | ||
78 | |||
79 | </div> | ||
80 | </div> | ||
81 | |||
82 | </div> | ||
83 | |||
84 | |||
85 | </div> | ||
86 | |||
87 | <!-- if includes --> | ||
88 | |||
89 | <div id="section"> | ||
90 | |||
91 | |||
92 | |||
93 | |||
94 | <!-- if method_list --> | ||
95 | |||
96 | |||
97 | |||
98 | |||
99 | </div> | ||
100 | |||
101 | <div id="validator-badges"> | ||
102 | <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | ||
103 | </div> | ||
104 | |||
105 | </body> | ||
106 | </html> | ||
107 |
doc/files/lib/mirimiri/query_rb.html
File was created | 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | ||
3 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | ||
4 | <head> | ||
5 | <title>File: query.rb [RDoc Documentation]</title> | ||
6 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | ||
7 | <meta http-equiv="Content-Script-Type" content="text/javascript" /> | ||
8 | <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | ||
9 | <script type="text/javascript"> | ||
10 | // <![CDATA[ | ||
11 | |||
12 | function popupCode( url ) { | ||
13 | window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | ||
14 | } | ||
15 | |||
16 | function toggleCode( id ) { | ||
17 | if ( document.getElementById ) | ||
18 | elem = document.getElementById( id ); | ||
19 | else if ( document.all ) | ||
20 | elem = eval( "document.all." + id ); | ||
21 | else | ||
22 | return false; | ||
23 | |||
24 | elemStyle = elem.style; | ||
25 | |||
26 | if ( elemStyle.display != "block" ) { | ||
27 | elemStyle.display = "block" | ||
28 | } else { | ||
29 | elemStyle.display = "none" | ||
30 | } | ||
31 | |||
32 | return true; | ||
33 | } | ||
34 | |||
35 | // Make codeblocks hidden by default | ||
36 | document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | ||
37 | |||
38 | // ]]> | ||
39 | </script> | ||
40 | |||
41 | </head> | ||
42 | <body> | ||
43 | |||
44 | |||
45 | <div id="fileHeader"> | ||
46 | <h1>query.rb</h1> | ||
47 | <table class="header-table"> | ||
48 | <tr class="top-aligned-row"> | ||
49 | <td><strong>Path:</strong></td> | ||
50 | <td>lib/mirimiri/query.rb | ||
51 | |||
52 | </td> | ||
53 | </tr> | ||
54 | <tr class="top-aligned-row"> | ||
55 | <td><strong>Last Update:</strong></td> | ||
56 | <td>2010-12-20 10:36:27 +0100</td> | ||
57 | </tr> | ||
58 | </table> | ||
59 | </div> | ||
60 | <!-- banner header --> | ||
61 | |||
62 | <div id="bodyContent"> | ||
63 | |||
64 | <div id="contextContent"> | ||
65 | |||
66 | </div> | ||
67 | |||
68 | |||
69 | </div> | ||
70 | |||
71 | <!-- if includes --> | ||
72 | |||
73 | <div id="section"> | ||
74 | |||
75 | |||
76 | |||
77 | |||
78 | <!-- if method_list --> | ||
79 | |||
80 | |||
81 | |||
82 | |||
83 | </div> | ||
84 | |||
85 | <div id="validator-badges"> | ||
86 | <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | ||
87 | </div> | ||
88 | |||
89 | </body> | ||
90 | </html> | ||
91 |
doc/files/lib/mirimiri/regexp_rb.html
File was created | 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | ||
3 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | ||
4 | <head> | ||
5 | <title>File: regexp.rb [RDoc Documentation]</title> | ||
6 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | ||
7 | <meta http-equiv="Content-Script-Type" content="text/javascript" /> | ||
8 | <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | ||
9 | <script type="text/javascript"> | ||
10 | // <![CDATA[ | ||
11 | |||
12 | function popupCode( url ) { | ||
13 | window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | ||
14 | } | ||
15 | |||
16 | function toggleCode( id ) { | ||
17 | if ( document.getElementById ) | ||
18 | elem = document.getElementById( id ); | ||
19 | else if ( document.all ) | ||
20 | elem = eval( "document.all." + id ); | ||
21 | else | ||
22 | return false; | ||
23 | |||
24 | elemStyle = elem.style; | ||
25 | |||
26 | if ( elemStyle.display != "block" ) { | ||
27 | elemStyle.display = "block" | ||
28 | } else { | ||
29 | elemStyle.display = "none" | ||
30 | } | ||
31 | |||
32 | return true; | ||
33 | } | ||
34 | |||
35 | // Make codeblocks hidden by default | ||
36 | document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | ||
37 | |||
38 | // ]]> | ||
39 | </script> | ||
40 | |||
41 | </head> | ||
42 | <body> | ||
43 | |||
44 | |||
45 | <div id="fileHeader"> | ||
46 | <h1>regexp.rb</h1> | ||
47 | <table class="header-table"> | ||
48 | <tr class="top-aligned-row"> | ||
49 | <td><strong>Path:</strong></td> | ||
50 | <td>lib/mirimiri/regexp.rb | ||
51 | |||
52 | </td> | ||
53 | </tr> | ||
54 | <tr class="top-aligned-row"> | ||
55 | <td><strong>Last Update:</strong></td> | ||
56 | <td>2010-12-20 10:36:42 +0100</td> | ||
57 | </tr> | ||
58 | </table> | ||
59 | </div> | ||
60 | <!-- banner header --> | ||
61 | |||
62 | <div id="bodyContent"> | ||
63 | |||
64 | <div id="contextContent"> | ||
65 | |||
66 | </div> | ||
67 | |||
68 | |||
69 | </div> | ||
70 | |||
71 | <!-- if includes --> | ||
72 | |||
73 | <div id="section"> | ||
74 | |||
75 | |||
76 | |||
77 | |||
78 | <!-- if method_list --> | ||
79 | |||
80 | |||
81 | |||
82 | |||
83 | </div> | ||
84 | |||
85 | <div id="validator-badges"> | ||
86 | <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | ||
87 | </div> | ||
88 | |||
89 | </body> | ||
90 | </html> | ||
91 |
doc/files/lib/mirimiri/string_rb.html
File was created | 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | ||
3 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | ||
4 | <head> | ||
5 | <title>File: string.rb [RDoc Documentation]</title> | ||
6 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | ||
7 | <meta http-equiv="Content-Script-Type" content="text/javascript" /> | ||
8 | <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | ||
9 | <script type="text/javascript"> | ||
10 | // <![CDATA[ | ||
11 | |||
12 | function popupCode( url ) { | ||
13 | window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | ||
14 | } | ||
15 | |||
16 | function toggleCode( id ) { | ||
17 | if ( document.getElementById ) | ||
18 | elem = document.getElementById( id ); | ||
19 | else if ( document.all ) | ||
20 | elem = eval( "document.all." + id ); | ||
21 | else | ||
22 | return false; | ||
23 | |||
24 | elemStyle = elem.style; | ||
25 | |||
26 | if ( elemStyle.display != "block" ) { | ||
27 | elemStyle.display = "block" | ||
28 | } else { | ||
29 | elemStyle.display = "none" | ||
30 | } | ||
31 | |||
32 | return true; | ||
33 | } | ||
34 | |||
35 | // Make codeblocks hidden by default | ||
36 | document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | ||
37 | |||
38 | // ]]> | ||
39 | </script> | ||
40 | |||
41 | </head> | ||
42 | <body> | ||
43 | |||
44 | |||
45 | <div id="fileHeader"> | ||
46 | <h1>string.rb</h1> | ||
47 | <table class="header-table"> | ||
48 | <tr class="top-aligned-row"> | ||
49 | <td><strong>Path:</strong></td> | ||
50 | <td>lib/mirimiri/string.rb | ||
51 | |||
52 | </td> | ||
53 | </tr> | ||
54 | <tr class="top-aligned-row"> | ||
55 | <td><strong>Last Update:</strong></td> | ||
56 | <td>2010-12-20 10:37:16 +0100</td> | ||
57 | </tr> | ||
58 | </table> | ||
59 | </div> | ||
60 | <!-- banner header --> | ||
61 | |||
62 | <div id="bodyContent"> | ||
63 | |||
64 | <div id="contextContent"> | ||
65 | |||
66 | <div id="description"> | ||
67 | <hr size="1"></hr><p> | ||
68 | General module | ||
69 | </p> | ||
70 | |||
71 | </div> | ||
72 | |||
73 | <div id="requires-list"> | ||
74 | <h3 class="section-bar">Required files</h3> | ||
75 | |||
76 | <div class="name-list"> | ||
77 | |||
78 | cgi | ||
79 | |||
80 | kconv | ||
81 | |||
82 | </div> | ||
83 | </div> | ||
84 | |||
85 | </div> | ||
86 | |||
87 | |||
88 | </div> | ||
89 | |||
90 | <!-- if includes --> | ||
91 | |||
92 | <div id="section"> | ||
93 | |||
94 | |||
95 | |||
96 | |||
97 | <!-- if method_list --> | ||
98 | |||
99 | |||
100 | |||
101 | |||
102 | </div> | ||
103 | |||
104 | <div id="validator-badges"> | ||
105 | <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | ||
106 | </div> | ||
107 | |||
108 | </body> | ||
109 | </html> | ||
110 |
doc/files/lib/mirimiri/ttagger_rb.html
File was created | 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | ||
3 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | ||
4 | <head> | ||
5 | <title>File: ttagger.rb [RDoc Documentation]</title> | ||
6 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | ||
7 | <meta http-equiv="Content-Script-Type" content="text/javascript" /> | ||
8 | <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | ||
9 | <script type="text/javascript"> | ||
10 | // <![CDATA[ | ||
11 | |||
12 | function popupCode( url ) { | ||
13 | window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | ||
14 | } | ||
15 | |||
16 | function toggleCode( id ) { | ||
17 | if ( document.getElementById ) | ||
18 | elem = document.getElementById( id ); | ||
19 | else if ( document.all ) | ||
20 | elem = eval( "document.all." + id ); | ||
21 | else | ||
22 | return false; | ||
23 | |||
24 | elemStyle = elem.style; | ||
25 | |||
26 | if ( elemStyle.display != "block" ) { | ||
27 | elemStyle.display = "block" | ||
28 | } else { | ||
29 | elemStyle.display = "none" | ||
30 | } | ||
31 | |||
32 | return true; | ||
33 | } | ||
34 | |||
35 | // Make codeblocks hidden by default | ||
36 | document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | ||
37 | |||
38 | // ]]> | ||
39 | </script> | ||
40 | |||
41 | </head> | ||
42 | <body> | ||
43 | |||
44 | |||
45 | <div id="fileHeader"> | ||
46 | <h1>ttagger.rb</h1> | ||
47 | <table class="header-table"> | ||
48 | <tr class="top-aligned-row"> | ||
49 | <td><strong>Path:</strong></td> | ||
50 | <td>lib/mirimiri/ttagger.rb | ||
51 | |||
52 | </td> | ||
53 | </tr> | ||
54 | <tr class="top-aligned-row"> | ||
55 | <td><strong>Last Update:</strong></td> | ||
56 | <td>2010-12-20 10:37:32 +0100</td> | ||
57 | </tr> | ||
58 | </table> | ||
59 | </div> | ||
60 | <!-- banner header --> | ||
61 | |||
62 | <div id="bodyContent"> | ||
63 | |||
64 | <div id="contextContent"> | ||
65 | |||
66 | </div> | ||
67 | |||
68 | |||
69 | </div> | ||
70 | |||
71 | <!-- if includes --> | ||
72 | |||
73 | <div id="section"> | ||
74 | |||
75 | |||
76 | |||
77 | |||
78 | <!-- if method_list --> | ||
79 | |||
80 | |||
81 | |||
82 | |||
83 | </div> | ||
84 | |||
85 | <div id="validator-badges"> | ||
86 | <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | ||
87 | </div> | ||
88 | |||
89 | </body> | ||
90 | </html> | ||
91 |
doc/files/lib/mirimiri_rb.html
File was created | 1 | <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | ||
3 | <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | ||
4 | <head> | ||
5 | <title>File: mirimiri.rb [RDoc Documentation]</title> | ||
6 | <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | ||
7 | <meta http-equiv="Content-Script-Type" content="text/javascript" /> | ||
8 | <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" /> | ||
9 | <script type="text/javascript"> | ||
10 | // <![CDATA[ | ||
11 | |||
12 | function popupCode( url ) { | ||
13 | window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | ||
14 | } | ||
15 | |||
16 | function toggleCode( id ) { | ||
17 | if ( document.getElementById ) | ||
18 | elem = document.getElementById( id ); | ||
19 | else if ( document.all ) | ||
20 | elem = eval( "document.all." + id ); | ||
21 | else | ||
22 | return false; | ||
23 | |||
24 | elemStyle = elem.style; | ||
25 | |||
26 | if ( elemStyle.display != "block" ) { | ||
27 | elemStyle.display = "block" | ||
28 | } else { | ||
29 | elemStyle.display = "none" | ||
30 | } | ||
31 | |||
32 | return true; | ||
33 | } | ||
34 | |||
35 | // Make codeblocks hidden by default | ||
36 | document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | ||
37 | |||
38 | // ]]> | ||
39 | </script> | ||
40 | |||
41 | </head> | ||
42 | <body> | ||
43 | |||
44 | |||
45 | <div id="fileHeader"> | ||
46 | <h1>mirimiri.rb</h1> | ||
47 | <table class="header-table"> | ||
48 | <tr class="top-aligned-row"> | ||
49 | <td><strong>Path:</strong></td> | ||
50 | <td>lib/mirimiri.rb | ||
51 | |||
52 | </td> | ||
53 | </tr> | ||
54 | <tr class="top-aligned-row"> | ||
55 | <td><strong>Last Update:</strong></td> | ||
56 | <td>2010-12-20 10:33:51 +0100</td> | ||
57 | </tr> | ||
58 | </table> | ||
59 | </div> | ||
60 | <!-- banner header --> | ||
61 | |||
62 | <div id="bodyContent"> | ||
63 | |||
64 | <div id="contextContent"> | ||
65 | |||
66 | <div id="requires-list"> | ||
67 | <h3 class="section-bar">Required files</h3> | ||
68 | |||
69 | <div class="name-list"> | ||
70 | |||
71 | rir/document | ||
72 | |||
73 | rir/string | ||
74 | |||
75 | rir/query | ||
76 | |||
77 | rir/corpus | ||
78 | |||
79 | rir/regexp | ||
80 | |||
81 | rir/ttagger | ||
82 | |||
83 | </div> | ||
84 | </div> | ||
85 | |||
86 | </div> | ||
87 | |||
88 | |||
89 | </div> | ||
90 | |||
91 | <!-- if includes --> | ||
92 | |||
93 | <div id="section"> | ||
94 | |||
95 | |||
96 | |||
97 | |||
98 | <!-- if method_list --> | ||
99 | |||
100 | |||
101 | |||
102 | |||
103 | </div> | ||
104 | |||
105 | <div id="validator-badges"> | ||
106 | <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | ||
107 | </div> | ||
108 | |||
109 | </body> | ||
110 | </html> | ||
111 |
lib/mirimiri.rb
File was created | 1 | #!/usr/bin/env ruby | |
2 | |||
3 | require 'mirimiri/document' | ||
4 | require 'mirimiri/string' | ||
5 | require 'mirimiri/query' | ||
6 | require 'mirimiri/corpus' | ||
7 | require 'mirimiri/regexp' | ||
8 | require 'mirimiri/ttagger' | ||
9 |
lib/mirimiri/corpus.rb
File was created | 1 | #!/usr/bin/env ruby | |
2 | |||
3 | #-- | ||
4 | # This file is a part of the mirimiri library | ||
5 | # | ||
6 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | ||
7 | # | ||
8 | # This program is free software: you can redistribute it and/or modify | ||
9 | # it under the terms of the GNU General Public License as published by | ||
10 | # the Free Software Foundation, either version 3 of the License, or | ||
11 | # (at your option) any later version. | ||
12 | # | ||
13 | # This program is distributed in the hope that it will be useful, | ||
14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | # GNU General Public License for more details. | ||
17 | # | ||
18 | # You should have received a copy of the GNU General Public License | ||
19 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
20 | #++ | ||
21 | |||
# A Corpus designates a directory tree of files through its root +path+.
class Corpus
  attr_accessor :path

  # Stores +path+ with one trailing "/" removed, so that "my/path/" and
  # "my/path" designate the same corpus.
  def initialize(path)
    @path = path.chomp("/")
  end

  # Recursively lists every regular file found under +self.path+.
  # Each entry is prefixed with +self.path+. Directories are never listed,
  # and files without an extension (README, Makefile, ...) are included.
  # Entries whose name starts with a dot are not matched by the glob.
  # WARNING ! This function may take a lot of time if many
  # files are in subdirectories.
  #
  #   c = Corpus.new "my/path"
  #   c.files # => ["my/path/README", "my/path/lib/code.rb"]
  def files
    Dir.glob(File.join(@path, "**", "*")).select { |entry| File.file?(entry) }
  end
end
39 |
lib/mirimiri/document.rb
File was created | 1 | #!/usr/bin/env ruby | |
2 | |||
3 | #-- | ||
4 | # This file is a part of the mirimiri library | ||
5 | # | ||
6 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | ||
7 | # | ||
8 | # This program is free software: you can redistribute it and/or modify | ||
9 | # it under the terms of the GNU General Public License as published by | ||
10 | # the Free Software Foundation, either version 3 of the License, or | ||
11 | # (at your option) any later version. | ||
12 | # | ||
13 | # This program is distributed in the hope that it will be useful, | ||
14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | # GNU General Public License for more details. | ||
17 | # | ||
18 | # You should have received a copy of the GNU General Public License | ||
19 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
20 | #++ | ||
21 | |||
22 | |||
23 | # General module | ||
24 | module Mirimiri | ||
25 | |||
26 | # A Document is a bag of words and is constructed from a string. | ||
27 | class Document | ||
28 | attr_reader :words, :doc_content | ||
29 | |||
30 | # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html | ||
31 | # and the \\W special escape). | ||
32 | # | ||
33 | # Protected function, only meant to by called at the initialization. | ||
34 | def format_words | ||
35 | wo = [] | ||
36 | |||
37 | @doc_content.split.each do |w| | ||
38 | w.split(/\W/).each do |sw| | ||
39 | wo.push(sw.downcase) if sw =~ /[a-zA-Z]/ | ||
40 | end | ||
41 | end | ||
42 | |||
43 | wo | ||
44 | end | ||
45 | |||
46 | # Returns an Array containing the +n+-grams (words) from the current Document. | ||
47 | # | ||
48 | # ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...] | ||
49 | def ngrams(n) | ||
50 | window = [] | ||
51 | ngrams_array = [] | ||
52 | |||
53 | @words.each do |w| | ||
54 | window.push(w) | ||
55 | if window.size == n | ||
56 | ngrams_array.push window.join(" ") | ||
57 | window.delete_at(0) | ||
58 | end | ||
59 | end | ||
60 | |||
61 | ngrams_array.uniq | ||
62 | end | ||
63 | |||
64 | # Returns a Hash containing the words and their associated counts in the current Document. | ||
65 | # | ||
66 | # count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... } | ||
67 | def count_words | ||
68 | counts = Hash.new { |h,k| h[k] = 0 } | ||
69 | @words.each { |w| counts[w] += 1 } | ||
70 | |||
71 | counts | ||
72 | end | ||
73 | |||
74 | # Computes the entropy of a given string +s+ inside the document. | ||
75 | # | ||
76 | # If the string parameter is composed of many words (i.e. tokens separated | ||
77 | # by whitespace(s)), it is considered as an ngram. | ||
78 | # | ||
79 | # entropy("guitar") #=> 0.00432114812727959 | ||
80 | # entropy("dillinger escape plan") #=> 0.265862076325102 | ||
81 | def entropy(s) | ||
82 | en = 0.0 | ||
83 | counts = self.count_words | ||
84 | |||
85 | s.split.each do |w| | ||
86 | p_wi = counts[w].to_f/@words.count.to_f | ||
87 | en += p_wi*Math.log2(p_wi) | ||
88 | end | ||
89 | |||
90 | en *= -1 | ||
91 | en | ||
92 | end | ||
93 | |||
94 | # Computes the term frequency of a given *word* +s+. | ||
95 | # | ||
96 | # tf("guitar") #=> 0.000380372765310004 | ||
97 | def tf(s) | ||
98 | self.count_words[s].to_f/@words.size.to_f | ||
99 | end | ||
100 | |||
101 | |||
102 | def initialize(content) | ||
103 | @doc_content = content | ||
104 | @words = format_words | ||
105 | end | ||
106 | |||
107 | protected :format_words | ||
108 | end | ||
109 | |||
110 | # A WebDocument is a Document with a +url+. | ||
111 | class WebDocument < Document | ||
112 | attr_reader :url | ||
113 | |||
114 | # Returns the HTML text from the page of a given +url+. | ||
115 | def self.get_content(url) | ||
116 | require 'net/http' | ||
117 | Net::HTTP.get(URI.parse(url)) | ||
118 | end | ||
119 | |||
120 | # WebDocument constructor, the content of the Document is the HTML page | ||
121 | # without the tags. | ||
122 | def initialize(url) | ||
123 | @url = url | ||
124 | super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags | ||
125 | end | ||
126 | end | ||
127 | |||
128 | # A WikipediaPage is a WebDocument. | ||
129 | class WikipediaPage < WebDocument | ||
130 | require 'rexml/document' | ||
131 | require 'net/http' | ||
132 | require 'kconv' | ||
133 | |||
134 | |||
135 | def self.search_wikipedia_titles(name) | ||
136 | raise ArgumentError, "Bad encoding", name unless name.isutf8 | ||
137 | |||
138 | res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).toutf8).elements['api/query/search'] | ||
139 | |||
140 | res.collect { |e| e.attributes['title'] } unless res.nil? | ||
141 | end | ||
142 | |||
143 | def self.get_url(name) | ||
144 | raise ArgumentError, "Bad encoding", name unless name.isutf8 | ||
145 | |||
146 | atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).toutf8).elements['api/query/pages/page'].attributes | ||
147 | |||
148 | atts['fullurl'] if atts['missing'].nil? | ||
149 | end | ||
150 | |||
151 | def self.search_homepage(name) | ||
152 | title = WikipediaPage.search_wikipedia_titles name | ||
153 | |||
154 | WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty? | ||
155 | end | ||
156 | |||
157 | # def initialize(name) | ||
158 | # title = WikipediaPage.search_wikipedia_titles name | ||
159 | # raise ArgumentError, "No page found" if title.empty? | ||
160 | # super WikipediaPage.get_url title[0] | ||
161 | # end | ||
162 | end | ||
163 | end | ||
164 |
lib/mirimiri/query.rb
File was created | 1 | #!/usr/bin/env ruby | |
2 | |||
3 | #-- | ||
4 | # This file is a part of the mirimiri library | ||
5 | # | ||
6 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | ||
7 | # | ||
8 | # This program is free software: you can redistribute it and/or modify | ||
9 | # it under the terms of the GNU General Public License as published by | ||
10 | # the Free Software Foundation, either version 3 of the License, or | ||
11 | # (at your option) any later version. | ||
12 | # | ||
13 | # This program is distributed in the hope that it will be useful, | ||
14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | # GNU General Public License for more details. | ||
17 | # | ||
18 | # You should have received a copy of the GNU General Public License | ||
19 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
20 | #++ | ||
21 | |||
# Common ancestor of the query classes; it defines no behaviour by itself.
class Query
end

module Indri

  # Holds the settings written in front of a query: index location, memory,
  # number of results, query offset, run identifier, output switches and
  # either a +baseline+ or a smoothing +rule+.
  class Parameters
    attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline

    # +corpus+ is stored as the index path. The two +print_*+ flags are kept
    # as the strings "true" or "false". +rule+ and +baseline+ are not set
    # here; use their accessors.
    def initialize(corpus,mem="1g",count="1000",offset="1",run_id="default",print_query=false,print_docs=false)
      @index_path  = corpus
      @memory      = mem
      @count       = count
      @offset      = offset
      @run_id      = run_id
      @print_query = print_query ? "true" : "false"
      @print_docs  = print_docs ? "true" : "false"
    end

    # Renders the settings as XML, one element per line. The <parameters>
    # element is opened but NOT closed, so that more elements can be appended.
    # When +baseline+ is set it is written instead of +rule+.
    def to_s
      model = if @baseline.nil?
                "<rule>#{@rule}</rule>\n"
              else
                "<baseline>#{@baseline}</baseline>\n"
              end

      [
        "<parameters>\n",
        "<memory>#{@memory}</memory>\n",
        "<index>#{@index_path}</index>\n",
        "<count>#{@count}</count>\n",
        model,
        "<queryOffset>#{@offset}</queryOffset>\n",
        "<runID>#{@run_id}</runID>\n",
        "<printQuery>#{@print_query}</printQuery>\n",
        "<printDocuments>#{@print_docs}</printDocuments>\n"
      ].join
    end
  end

  # A query made of an identifier, a query text and the Parameters it is
  # rendered with.
  class IndriQuery < Query
    attr_accessor :id, :query, :params, :rule

    # Keeps +id+, +query+ and +params+. When +params+ has no rule yet, the
    # default retrieval model is stored into it: Language Modeling with a
    # Dirichlet smoothing at 2500. Note that the given +params+ object itself
    # is modified.
    # TODO: maybe a Rule class...
    def initialize(id,query,params)
      params.rule = 'method:dirichlet,mu:2500' if params.rule.nil?

      @params = params
      @id     = id
      @query  = query
    end

    # Renders the parameters followed by the <query> element, then closes
    # the <parameters> element.
    def to_s
      [
        @params.to_s,
        "<query>\n",
        "<number>#{@id}</number>\n",
        "<text>#{@query}</text>\n",
        "</query>\n",
        "</parameters>"
      ].join
    end
  end

end
86 |
lib/mirimiri/regexp.rb
File was created | 1 | #!/usr/bin/env ruby | |
2 | |||
3 | #-- | ||
4 | # This file is a part of the mirimiri library | ||
5 | # | ||
6 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | ||
7 | # | ||
8 | # This program is free software: you can redistribute it and/or modify | ||
9 | # it under the terms of the GNU General Public License as published by | ||
10 | # the Free Software Foundation, either version 3 of the License, or | ||
11 | # (at your option) any later version. | ||
12 | # | ||
13 | # This program is distributed in the hope that it will be useful, | ||
14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | # GNU General Public License for more details. | ||
17 | # | ||
18 | # You should have received a copy of the GNU General Public License | ||
19 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
20 | #++ | ||
21 | |||
# Extension of the standard class Regexp.
class Regexp

  # Returns a new Regexp matching a line in which +self+ matches nowhere:
  # every character between the start (^) and the end ($) of the line must
  # sit at a position where the negative lookahead on +self+ succeeds.
  #
  #   /foo/.negated =~ "bar baz" #=> 0
  #   /foo/.negated =~ "a foo b" #=> nil
  def negated
    Regexp.new("^((?!#{self}).)*$")
  end

end
29 |
lib/mirimiri/string.rb
File was created | 1 | #!/usr/bin/env ruby | |
2 | |||
3 | #-- | ||
4 | # This file is a part of the mirimiri library | ||
5 | # | ||
6 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | ||
7 | # | ||
8 | # This program is free software: you can redistribute it and/or modify | ||
9 | # it under the terms of the GNU General Public License as published by | ||
10 | # the Free Software Foundation, either version 3 of the License, or | ||
11 | # (at your option) any later version. | ||
12 | # | ||
13 | # This program is distributed in the hope that it will be useful, | ||
14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | # GNU General Public License for more details. | ||
17 | # | ||
18 | # You should have received a copy of the GNU General Public License | ||
19 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
20 | #++ | ||
21 | |||
require 'cgi'
require 'kconv'

module Mirimiri

  # These are the default stopwords provided by Lemur.
  # NOTE(review): the list jumps from "a" straight to "anything" (no "an",
  # "and", "all", ...) -- confirm whether entries were lost when it was copied.
  Stoplist = %w[
    a anything anyway anywhere apart are around as at av
    be became because become becomes becoming been before beforehand
    behind being below beside besides between beyond both but by
    can cannot canst certain cf choose contrariwise cos could cu
    day do does doesn't doing dost doth double down dual during
    each either else elsewhere enough et etc even ever every
    everybody everyone everything everywhere except excepted excepting
    exception exclude excluding exclusive far farther farthest few ff
    first for formerly forth forward from front further furthermore
    furthest get go had halves hardly has hast hath have he
    hence henceforth her here hereabouts hereafter hereby herein hereto
    hereupon hers herself him himself hindmost his hither hitherto
    how however howsoever i ie if in inasmuch inc include
    included including indeed indoors inside insomuch instead into
    inward inwards is it its itself just kind kg km last
    latter latterly less lest let like little ltd many may maybe
    me meantime meanwhile might moreover most mostly more mr mrs
    ms much must my myself namely need neither never nevertheless
    next no nobody none nonetheless noone nope nor not nothing
    notwithstanding now nowadays nowhere of off often ok on once
    one only onto or other others otherwise ought our ours
    ourselves out outside over own per perhaps plenty provide quite
    rather really round said sake same sang save saw see seeing
    seem seemed seeming seems seen seldom selves sent several shalt
    she should shown sideways since slept slew slung slunk smote
    so some somebody somehow someone something sometime sometimes
    somewhat somewhere spake spat spoke spoken sprang sprung stave
    staves still such supposing than that the thee their them
    themselves then thence thenceforth there thereabout thereabouts
    thereafter thereby therefore therein thereof thereon thereto thereupon
    these they this those thou though thrice through throughout thru
    thus thy thyself till to together too toward towards ugh
    unable under underneath unless unlike until up upon upward
    upwards us use used using very via vs want was we week
    well were what whatever whatsoever when whence whenever whensoever
    where whereabouts whereafter whereas whereat whereby wherefore
    wherefrom wherein whereinto whereof whereon wheresoever whereto
    whereunto whereupon wherever wherewith whether whew which whichever
    whichsoever while whilst whither who whoa whoever whole whom
    whomever whomsoever whose whosoever why will wilt with within
    without worse worst would wow ye yet year yippee you your
    yours yourself yourselves
  ]

end

# Extension of the standard class String with useful functions.
#
# Every +strip_*+ method also decodes the HTML entities of the remaining text
# (CGI::unescapeHTML) and converts the result with +toutf8+.
class String
  include Mirimiri

  # Returns +true+ if +self+ (case-insensitively) belongs to
  # Mirimiri::Stoplist, +false+ otherwise.
  def is_stopword?
    Stoplist.include?(downcase)
  end

  # Do not use.
  # TODO: revamp. find why this function is here.
  def remove_special_characters
    split.collect do |w|
      w.gsub(/\W/, ' ').split.collect { |part| part.gsub(/\W/, ' ').strip.sub(/\A.\z/, '') }.join(' ').strip.sub(/\A.\z/, '')
    end.join(' ')
  end

  # Removes all XML-like tags from +self+.
  #
  #   s = "<html><body>test</body></html>"
  #   s.strip_xml_tags!
  #   s #=> "test"
  def strip_xml_tags!
    replace(strip_with_pattern(/<\/?[^>]*>/))
  end

  # Returns a copy of +self+ without its XML-like tags.
  #
  #   s = "<html><body>test</body></html>"
  #   s.strip_xml_tags #=> "test"
  #   s                #=> "<html><body>test</body></html>"
  def strip_xml_tags
    dup.strip_xml_tags!
  end

  # Removes every <script> element (tags and content) from +self+, whatever
  # its attributes, their quoting or the case of the tag name. The text
  # around the element, whitespace included, is left untouched.
  #
  #   s = "<script type='text/javascript'>
  #   var skin='vector',
  #   stylepath='http://bits.wikimedia.org/skins-1.5'
  #   </script>
  #
  #   test"
  #   s.strip_javascripts!
  #   s #=> "\n\ntest"
  def strip_javascripts!
    replace(strip_with_pattern(/<script\b[^>]*>.*?<\/script\s*>/mi))
  end

  # Returns a copy of +self+ without its <script> elements.
  #
  #   "<script src='a.js'></script>test".strip_javascripts #=> "test"
  def strip_javascripts
    dup.strip_javascripts!
  end

  # Removes every <style> element (tags and content) from +self+, whatever
  # its attributes, their quoting or the case of the tag name.
  #
  #   s = "<style type='text/css'>p { color: red }</style>test"
  #   s.strip_stylesheets!
  #   s #=> "test"
  def strip_stylesheets!
    replace(strip_with_pattern(/<style\b[^>]*>.*?<\/style\s*>/mi))
  end

  # Returns a copy of +self+ without its <style> elements.
  def strip_stylesheets
    dup.strip_stylesheets!
  end

  # Removes punctuation from +self+: only ASCII letters, digits, hyphens and
  # whitespace are kept.
  #
  #   s = "hello, world. how are you?!"
  #   s.strip_punctuation!
  #   s # => "hello world how are you"
  def strip_punctuation!
    replace(strip_with_pattern(/[^a-zA-Z0-9\-\s]/))
  end

  # Returns a copy of +self+ without its punctuation.
  #
  #   s = "hello, world. how are you?!"
  #   s.strip_punctuation # => "hello world how are you"
  def strip_punctuation
    dup.strip_punctuation!
  end

  # Returns the text values inside all occurrences of a XML tag in +self+.
  # Only tags named exactly +tag_name+ are considered (asking for 'a' does
  # not return the content of <abbr>), and a value has to start and end on
  # the same line.
  #
  #   s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre"
  #   s.extract_xmltags_values 'a' #=> ["Indianapolis", "Indiana"]
  def extract_xmltags_values(tag_name)
    scan(/<#{tag_name}\b[^>]*>(.+?)<\/#{tag_name}>/).flatten
  end

  private

  # Returns a new String: +self+ without the parts matching +pattern+, with
  # its HTML entities decoded, converted with +toutf8+.
  def strip_with_pattern(pattern)
    CGI::unescapeHTML(gsub(pattern, "")).toutf8
  end
end
175 |
lib/mirimiri/ttagger.rb
File was created | 1 | #!/usr/bin/env ruby | |
2 | |||
3 | #-- | ||
4 | # This file is a part of the mirimiri library | ||
5 | # | ||
6 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | ||
7 | # | ||
8 | # This program is free software: you can redistribute it and/or modify | ||
9 | # it under the terms of the GNU General Public License as published by | ||
10 | # the Free Software Foundation, either version 3 of the License, or | ||
11 | # (at your option) any later version. | ||
12 | # | ||
13 | # This program is distributed in the hope that it will be useful, | ||
14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | # GNU General Public License for more details. | ||
17 | # | ||
18 | # You should have received a copy of the GNU General Public License | ||
19 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
20 | #++ | ||
21 | |||
22 | |||
# TreeTagger-related stuff module.
#
# See http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/DecisionTreeTagger.html
module TreeTagger

  # This class handles generic parsing of tagger-chunker outputs.
  class TaggerChunker
    attr_reader :chunks, :file

    # Parses a tagger-chunker output and returns an Array of Chunk.
    #
    # +chunk_lines+ is any collection of lines responding to +each+; the lines
    # are left untouched. A line is either an opening tag (<NC>), a closing
    # tag (</NC>) or a token line, of which only the first
    # whitespace-separated field is kept. A Chunk is emitted each time the
    # closing tag matching the last opening tag is met and at least one word
    # has been collected. Blank lines are ignored.
    def self.parse(chunk_lines)
      in_chunk = false
      tag = nil

      chunks = []
      words = []

      chunk_lines.each do |raw|
        # Work on a copy: the caller's lines may be frozen or reused.
        line = raw.chomp

        if line =~ /^<\w+>$/
          in_chunk = true
          tag = line
        elsif line =~ /^<\/\w+>$/
          # Closing tags that do not close the last opened tag are skipped.
          if !words.empty? && in_chunk && line == tag.sub(/</, '</')
            in_chunk = false
            chunks.push Chunk.new(words.join(" "), tag)
            words.clear
          end
        else
          word = line.split.first
          words.push(word) unless word.nil?
        end
      end

      chunks
    end

    # Initializes parsing. +chunk_file+ is the output of +tagger-chunker-+ and must
    # be a valid path to the file. The path is kept in +file+ and the parsed
    # chunks in +chunks+.
    #
    #   TaggerChunker.new("ttout/2010020") #=> #<TreeTagger::TaggerChunker:0x92fd088 @chunks=[#<TreeTagger::Chunk:0x8ec5a10 @words=["robert", "schumann"], @tag="NC">, ...] ...>
    def initialize(chunk_file)
      @file = chunk_file
      # File.readlines closes the file once it has been read.
      @chunks = TaggerChunker.parse(File.readlines(chunk_file))
    end

  end

  class TaggerChunkerEnglish < TaggerChunker
  end

  class TaggerChunkerFrench < TaggerChunker
  end

  class TaggerChunkerGerman < TaggerChunker
  end

  # Represents a Chunk extracted when parsing a TaggerChunker file.
  class Chunk
    attr_reader :words, :tag

    # Creates a Chunk.
    #
    # * +str+ are whitespace-separated terms.
    # * +tag+ is the opening tag line, e.g. "<NC>"; its first and last
    #   characters are dropped, so that +tag+ returns "NC".
    #   See : ftp://ftp.ims.uni-stuttgart.de/pub/corpora/chunker-tagset-english.txt
    def initialize(str, tag)
      @words = str.split
      @tag = tag[1..-2]
    end
  end

end
96 |