Commit cd74322524114fbad147da48f608384d03e46c58
1 parent
b3995017e6
Exists in
master
adding missing files
Showing 29 changed files with 2390 additions and 0 deletions Side-by-side Diff
- doc/classes/Mirimiri.html
- doc/classes/Mirimiri/Document.html
- doc/classes/Mirimiri/Document.src/M000022.html
- doc/classes/Mirimiri/Document.src/M000023.html
- doc/classes/Mirimiri/Document.src/M000024.html
- doc/classes/Mirimiri/Document.src/M000025.html
- doc/classes/Mirimiri/Document.src/M000026.html
- doc/classes/Mirimiri/Document.src/M000027.html
- doc/classes/Mirimiri/WebDocument.html
- doc/classes/Mirimiri/WebDocument.src/M000028.html
- doc/classes/Mirimiri/WebDocument.src/M000029.html
- doc/classes/Mirimiri/WikipediaPage.html
- doc/classes/Mirimiri/WikipediaPage.src/M000030.html
- doc/classes/Mirimiri/WikipediaPage.src/M000031.html
- doc/classes/Mirimiri/WikipediaPage.src/M000032.html
- doc/files/lib/mirimiri/corpus_rb.html
- doc/files/lib/mirimiri/document_rb.html
- doc/files/lib/mirimiri/query_rb.html
- doc/files/lib/mirimiri/regexp_rb.html
- doc/files/lib/mirimiri/string_rb.html
- doc/files/lib/mirimiri/ttagger_rb.html
- doc/files/lib/mirimiri_rb.html
- lib/mirimiri.rb
- lib/mirimiri/corpus.rb
- lib/mirimiri/document.rb
- lib/mirimiri/query.rb
- lib/mirimiri/regexp.rb
- lib/mirimiri/string.rb
- lib/mirimiri/ttagger.rb
doc/classes/Mirimiri.html
1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
4 | +<head> | |
5 | + <title>Module: Mirimiri [RDoc Documentation]</title> | |
6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
7 | + <meta http-equiv="Content-Script-Type" content="text/javascript" /> | |
8 | + <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" /> | |
9 | + <script type="text/javascript"> | |
10 | + // <![CDATA[ | |
11 | + | |
12 | + function popupCode( url ) { | |
13 | + window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | |
14 | + } | |
15 | + | |
16 | + function toggleCode( id ) { | |
17 | + if ( document.getElementById ) | |
18 | + elem = document.getElementById( id ); | |
19 | + else if ( document.all ) | |
20 | + elem = eval( "document.all." + id ); | |
21 | + else | |
22 | + return false; | |
23 | + | |
24 | + elemStyle = elem.style; | |
25 | + | |
26 | + if ( elemStyle.display != "block" ) { | |
27 | + elemStyle.display = "block" | |
28 | + } else { | |
29 | + elemStyle.display = "none" | |
30 | + } | |
31 | + | |
32 | + return true; | |
33 | + } | |
34 | + | |
35 | + // Make codeblocks hidden by default | |
36 | + document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | |
37 | + | |
38 | + // ]]> | |
39 | + </script> | |
40 | + | |
41 | +</head> | |
42 | +<body> | |
43 | + | |
44 | + | |
45 | + <div id="classHeader"> | |
46 | + <table class="header-table"> | |
47 | + <tr class="top-aligned-row"> | |
48 | + <td><strong>Module</strong></td> | |
49 | + <td class="class-name-in-header">Mirimiri</td> | |
50 | + </tr> | |
51 | + <tr class="top-aligned-row"> | |
52 | + <td><strong>In:</strong></td> | |
53 | + <td> | |
54 | + | |
55 | + | |
56 | + <a href="../files/lib/mirimiri/string_rb.html"> | |
57 | + | |
58 | + lib/mirimiri/string.rb | |
59 | + | |
60 | + </a> | |
61 | + | |
62 | + | |
63 | + <br /> | |
64 | + | |
65 | + | |
66 | + <a href="../files/lib/mirimiri/document_rb.html"> | |
67 | + | |
68 | + lib/mirimiri/document.rb | |
69 | + | |
70 | + </a> | |
71 | + | |
72 | + | |
73 | + <br /> | |
74 | + | |
75 | + </td> | |
76 | + </tr> | |
77 | + | |
78 | + | |
79 | + </table> | |
80 | + </div> | |
81 | + <!-- banner header --> | |
82 | + | |
83 | + <div id="bodyContent"> | |
84 | + | |
85 | + <div id="contextContent"> | |
86 | + | |
87 | + <div id="description"> | |
88 | + <hr size="1"></hr><p> | |
89 | +General module | |
90 | +</p> | |
91 | + | |
92 | + </div> | |
93 | + | |
94 | + </div> | |
95 | + | |
96 | + | |
97 | + </div> | |
98 | + | |
99 | + <!-- if includes --> | |
100 | + | |
101 | + <div id="section"> | |
102 | + | |
103 | + <div id="class-list"> | |
104 | + <h3 class="section-bar">Classes and Modules</h3> | |
105 | + | |
106 | + Class <a href="Mirimiri/Document.html" class="link">Mirimiri::Document</a><br /> | |
107 | +Class <a href="Mirimiri/WebDocument.html" class="link">Mirimiri::WebDocument</a><br /> | |
108 | +Class <a href="Mirimiri/WikipediaPage.html" class="link">Mirimiri::WikipediaPage</a><br /> | |
109 | + | |
110 | + </div> | |
111 | + | |
112 | + <div id="constants-list"> | |
113 | + <h3 class="section-bar">Constants</h3> | |
114 | + | |
115 | + <div class="name-list"> | |
116 | + <table summary="Constants"> | |
117 | + | |
118 | + <tr class="top-aligned-row context-row"> | |
119 | + <td class="context-item-name">Stoplist</td> | |
120 | + <td>=</td> | |
121 | + <td class="context-item-value">[ "a", "anything", "anyway", "anywhere", "apart", "are", "around", "as", "at", "av", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "both", "but", "by", "can", "cannot", "canst", "certain", "cf", "choose", "contrariwise", "cos", "could", "cu", "day", "do", "does", "doesn't", "doing", "dost", "doth", "double", "down", "dual", "during", "each", "either", "else", "elsewhere", "enough", "et", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "except", "excepted", "excepting", "exception", "exclude", "excluding", "exclusive", "far", "farther", "farthest", "few", "ff", "first", "for", "formerly", "forth", "forward", "from", "front", "further", "furthermore", "furthest", "get", "go", "had", "halves", "hardly", "has", "hast", "hath", "have", "he", "hence", "henceforth", "her", "here", "hereabouts", "hereafter", "hereby", "herein", "hereto", "hereupon", "hers", "herself", "him", "himself", "hindmost", "his", "hither", "hitherto", "how", "however", "howsoever", "i", "ie", "if", "in", "inasmuch", "inc", "include", "included", "including", "indeed", "indoors", "inside", "insomuch", "instead", "into", "inward", "inwards", "is", "it", "its", "itself", "just", "kind", "kg", "km", "last", "latter", "latterly", "less", "lest", "let", "like", "little", "ltd", "many", "may", "maybe", "me", "meantime", "meanwhile", "might", "moreover", "most", "mostly", "more", "mr", "mrs", "ms", "much", "must", "my", "myself", "namely", "need", "neither", "never", "nevertheless", "next", "no", "nobody", "none", "nonetheless", "noone", "nope", "nor", "not", "nothing", "notwithstanding", "now", "nowadays", "nowhere", "of", "off", "often", "ok", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "own", "per", "perhaps", "plenty", 
"provide", "quite", "rather", "really", "round", "said", "sake", "same", "sang", "save", "saw", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "seldom", "selves", "sent", "several", "shalt", "she", "should", "shown", "sideways", "since", "slept", "slew", "slung", "slunk", "smote", "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", "somewhat", "somewhere", "spake", "spat", "spoke", "spoken", "sprang", "sprung", "stave", "staves", "still", "such", "supposing", "than", "that", "the", "thee", "their", "them", "themselves", "then", "thence", "thenceforth", "there", "thereabout", "thereabouts", "thereafter", "thereby", "therefore", "therein", "thereof", "thereon", "thereto", "thereupon", "these", "they", "this", "those", "thou", "though", "thrice", "through", "throughout", "thru", "thus", "thy", "thyself", "till", "to", "together", "too", "toward", "towards", "ugh", "unable", "under", "underneath", "unless", "unlike", "until", "up", "upon", "upward", "upwards", "us", "use", "used", "using", "very", "via", "vs", "want", "was", "we", "week", "well", "were", "what", "whatever", "whatsoever", "when", "whence", "whenever", "whensoever", "where", "whereabouts", "whereafter", "whereas", "whereat", "whereby", "wherefore", "wherefrom", "wherein", "whereinto", "whereof", "whereon", "wheresoever", "whereto", "whereunto", "whereupon", "wherever", "wherewith", "whether", "whew", "which", "whichever", "whichsoever", "while", "whilst", "whither", "who", "whoa", "whoever", "whole", "whom", "whomever", "whomsoever", "whose", "whosoever", "why", "will", "wilt", "with", "within", "without", "worse", "worst", "would", "wow", "ye", "yet", "year", "yippee", "you", "your", "yours", "yourself", "yourselves" ]</td> | |
122 | + | |
123 | + <td> </td> | |
124 | + <td class="context-item-desc"> | |
125 | +These are the default stopwords provided by Lemur. | |
126 | + | |
127 | +</td> | |
128 | + | |
129 | + </tr> | |
130 | + | |
131 | + </table> | |
132 | + </div> | |
133 | + </div> | |
134 | + | |
135 | + | |
136 | + | |
137 | + | |
138 | + <!-- if method_list --> | |
139 | + | |
140 | + | |
141 | + | |
142 | + | |
143 | + </div> | |
144 | + | |
145 | +<div id="validator-badges"> | |
146 | + <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | |
147 | +</div> | |
148 | + | |
149 | +</body> | |
150 | +</html> |
doc/classes/Mirimiri/Document.html
1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
4 | +<head> | |
5 | + <title>Class: Mirimiri::Document [RDoc Documentation]</title> | |
6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
7 | + <meta http-equiv="Content-Script-Type" content="text/javascript" /> | |
8 | + <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" /> | |
9 | + <script type="text/javascript"> | |
10 | + // <![CDATA[ | |
11 | + | |
12 | + function popupCode( url ) { | |
13 | + window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | |
14 | + } | |
15 | + | |
16 | + function toggleCode( id ) { | |
17 | + if ( document.getElementById ) | |
18 | + elem = document.getElementById( id ); | |
19 | + else if ( document.all ) | |
20 | + elem = eval( "document.all." + id ); | |
21 | + else | |
22 | + return false; | |
23 | + | |
24 | + elemStyle = elem.style; | |
25 | + | |
26 | + if ( elemStyle.display != "block" ) { | |
27 | + elemStyle.display = "block" | |
28 | + } else { | |
29 | + elemStyle.display = "none" | |
30 | + } | |
31 | + | |
32 | + return true; | |
33 | + } | |
34 | + | |
35 | + // Make codeblocks hidden by default | |
36 | + document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | |
37 | + | |
38 | + // ]]> | |
39 | + </script> | |
40 | + | |
41 | +</head> | |
42 | +<body> | |
43 | + | |
44 | + | |
45 | + <div id="classHeader"> | |
46 | + <table class="header-table"> | |
47 | + <tr class="top-aligned-row"> | |
48 | + <td><strong>Class</strong></td> | |
49 | + <td class="class-name-in-header">Mirimiri::Document</td> | |
50 | + </tr> | |
51 | + <tr class="top-aligned-row"> | |
52 | + <td><strong>In:</strong></td> | |
53 | + <td> | |
54 | + | |
55 | + | |
56 | + <a href="../../files/lib/mirimiri/document_rb.html"> | |
57 | + | |
58 | + lib/mirimiri/document.rb | |
59 | + | |
60 | + </a> | |
61 | + | |
62 | + | |
63 | + <br /> | |
64 | + | |
65 | + </td> | |
66 | + </tr> | |
67 | + | |
68 | + | |
69 | + <tr class="top-aligned-row"> | |
70 | + <td><strong>Parent:</strong></td> | |
71 | + <td> | |
72 | + | |
73 | + Object | |
74 | + | |
75 | + </td> | |
76 | + </tr> | |
77 | + | |
78 | + </table> | |
79 | + </div> | |
80 | + <!-- banner header --> | |
81 | + | |
82 | + <div id="bodyContent"> | |
83 | + | |
84 | + <div id="contextContent"> | |
85 | + | |
86 | + <div id="description"> | |
87 | + <p> | |
88 | +A <a href="Document.html">Document</a> is a bag of words and is constructed | |
89 | +from a string. | |
90 | +</p> | |
91 | + | |
92 | + </div> | |
93 | + | |
94 | + </div> | |
95 | + | |
96 | + | |
97 | + <div id="method-list"> | |
98 | + <h3 class="section-bar">Methods</h3> | |
99 | + | |
100 | + <div class="name-list"> | |
101 | + | |
102 | + <a href="#M000024">count_words</a> | |
103 | + | |
104 | + <a href="#M000025">entropy</a> | |
105 | + | |
106 | + <a href="#M000022">format_words</a> | |
107 | + | |
108 | + <a href="#M000027">new</a> | |
109 | + | |
110 | + <a href="#M000023">ngrams</a> | |
111 | + | |
112 | + <a href="#M000026">tf</a> | |
113 | + | |
114 | + </div> | |
115 | + </div> | |
116 | + | |
117 | + </div> | |
118 | + | |
119 | + <!-- if includes --> | |
120 | + | |
121 | + <div id="section"> | |
122 | + | |
123 | + | |
124 | + | |
125 | + <div id="attribute-list"> | |
126 | + <h3 class="section-bar">Attributes</h3> | |
127 | + | |
128 | + <div class="name-list"> | |
129 | + <table> | |
130 | + | |
131 | + <tr class="top-aligned-row context-row"> | |
132 | + <td class="context-item-name">doc_content</td> | |
133 | + | |
134 | + <td class="context-item-value"> [R] </td> | |
135 | + | |
136 | + <td class="context-item-desc"></td> | |
137 | + </tr> | |
138 | + | |
139 | + <tr class="top-aligned-row context-row"> | |
140 | + <td class="context-item-name">words</td> | |
141 | + | |
142 | + <td class="context-item-value"> [R] </td> | |
143 | + | |
144 | + <td class="context-item-desc"></td> | |
145 | + </tr> | |
146 | + | |
147 | + </table> | |
148 | + </div> | |
149 | + </div> | |
150 | + | |
151 | + | |
152 | + <!-- if method_list --> | |
153 | + | |
154 | + <div id="methods"> | |
155 | + | |
156 | + <h3 class="section-bar">Public Class methods</h3> | |
157 | + | |
158 | + | |
159 | + <div id="method-M000027" class="method-detail"> | |
160 | + <a name="M000027"></a> | |
161 | + | |
162 | + <div class="method-heading"> | |
163 | + | |
164 | + <a href="Document.src/M000027.html" target="Code" class="method-signature" | |
165 | + onclick="popupCode('Document.src/M000027.html');return false;"> | |
166 | + | |
167 | + <span class="method-name">new</span><span class="method-args">(content)</span> | |
168 | + | |
169 | + </a> | |
170 | + | |
171 | + </div> | |
172 | + | |
173 | + <div class="method-description"> | |
174 | + | |
175 | + </div> | |
176 | + </div> | |
177 | + | |
178 | + | |
179 | + <h3 class="section-bar">Public Instance methods</h3> | |
180 | + | |
181 | + | |
182 | + <div id="method-M000024" class="method-detail"> | |
183 | + <a name="M000024"></a> | |
184 | + | |
185 | + <div class="method-heading"> | |
186 | + | |
187 | + <a href="Document.src/M000024.html" target="Code" class="method-signature" | |
188 | + onclick="popupCode('Document.src/M000024.html');return false;"> | |
189 | + | |
190 | + <span class="method-name">count_words</span><span class="method-args">()</span> | |
191 | + | |
192 | + </a> | |
193 | + | |
194 | + </div> | |
195 | + | |
196 | + <div class="method-description"> | |
197 | + | |
198 | + <p> | |
199 | +Returns a Hash containing the words and their associated counts in the | |
200 | +current <a href="Document.html">Document</a>. | |
201 | +</p> | |
202 | +<pre> | |
203 | + count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... } | |
204 | +</pre> | |
205 | + | |
206 | + </div> | |
207 | + </div> | |
208 | + | |
209 | + | |
210 | + <div id="method-M000025" class="method-detail"> | |
211 | + <a name="M000025"></a> | |
212 | + | |
213 | + <div class="method-heading"> | |
214 | + | |
215 | + <a href="Document.src/M000025.html" target="Code" class="method-signature" | |
216 | + onclick="popupCode('Document.src/M000025.html');return false;"> | |
217 | + | |
218 | + <span class="method-name">entropy</span><span class="method-args">(s)</span> | |
219 | + | |
220 | + </a> | |
221 | + | |
222 | + </div> | |
223 | + | |
224 | + <div class="method-description"> | |
225 | + | |
226 | + <p> | |
227 | +Computes the entropy of a given string <tt>s</tt> inside the document. | |
228 | +</p> | |
229 | +<p> | |
230 | +If the string parameter is composed of several words (i.e. tokens separated by | |
231 | +whitespace), it is treated as an n-gram. | |
232 | +</p> | |
233 | +<pre> | |
234 | + entropy("guitar") #=> 0.00432114812727959 | |
235 | + entropy("dillinger escape plan") #=> 0.265862076325102 | |
236 | +</pre> | |
237 | + | |
238 | + </div> | |
239 | + </div> | |
240 | + | |
241 | + | |
242 | + <div id="method-M000023" class="method-detail"> | |
243 | + <a name="M000023"></a> | |
244 | + | |
245 | + <div class="method-heading"> | |
246 | + | |
247 | + <a href="Document.src/M000023.html" target="Code" class="method-signature" | |
248 | + onclick="popupCode('Document.src/M000023.html');return false;"> | |
249 | + | |
250 | + <span class="method-name">ngrams</span><span class="method-args">(n)</span> | |
251 | + | |
252 | + </a> | |
253 | + | |
254 | + </div> | |
255 | + | |
256 | + <div class="method-description"> | |
257 | + | |
258 | + <p> | |
259 | +Returns an Array containing the <tt>n</tt>-grams (words) from the current | |
260 | +<a href="Document.html">Document</a>. | |
261 | +</p> | |
262 | +<pre> | |
263 | + ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...] | |
264 | +</pre> | |
265 | + | |
266 | + </div> | |
267 | + </div> | |
268 | + | |
269 | + | |
270 | + <div id="method-M000026" class="method-detail"> | |
271 | + <a name="M000026"></a> | |
272 | + | |
273 | + <div class="method-heading"> | |
274 | + | |
275 | + <a href="Document.src/M000026.html" target="Code" class="method-signature" | |
276 | + onclick="popupCode('Document.src/M000026.html');return false;"> | |
277 | + | |
278 | + <span class="method-name">tf</span><span class="method-args">(s)</span> | |
279 | + | |
280 | + </a> | |
281 | + | |
282 | + </div> | |
283 | + | |
284 | + <div class="method-description"> | |
285 | + | |
286 | + <p> | |
287 | +Computes the term frequency of a given <b>word</b> <tt>s</tt>. | |
288 | +</p> | |
289 | +<pre> | |
290 | + tf("guitar") #=> 0.000380372765310004 | |
291 | +</pre> | |
292 | + | |
293 | + </div> | |
294 | + </div> | |
295 | + | |
296 | + | |
297 | + <h3 class="section-bar">Protected Instance methods</h3> | |
298 | + | |
299 | + | |
300 | + <div id="method-M000022" class="method-detail"> | |
301 | + <a name="M000022"></a> | |
302 | + | |
303 | + <div class="method-heading"> | |
304 | + | |
305 | + <a href="Document.src/M000022.html" target="Code" class="method-signature" | |
306 | + onclick="popupCode('Document.src/M000022.html');return false;"> | |
307 | + | |
308 | + <span class="method-name">format_words</span><span class="method-args">()</span> | |
309 | + | |
310 | + </a> | |
311 | + | |
312 | + </div> | |
313 | + | |
314 | + <div class="method-description"> | |
315 | + | |
316 | + <p> | |
317 | +Any non-word characters are removed from the words (see <a | |
318 | +href="http://perldoc.perl.org/perlre.html">perldoc.perl.org/perlre.html</a> | |
319 | +and the \W special escape). | |
320 | +</p> | |
321 | +<p> | |
322 | +Protected function, only meant to be called at initialization. | |
323 | +</p> | |
324 | + | |
325 | + </div> | |
326 | + </div> | |
327 | + | |
328 | + | |
329 | + | |
330 | + </div> | |
331 | + | |
332 | + | |
333 | + | |
334 | + | |
335 | + </div> | |
336 | + | |
337 | +<div id="validator-badges"> | |
338 | + <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | |
339 | +</div> | |
340 | + | |
341 | +</body> | |
342 | +</html> |
doc/classes/Mirimiri/Document.src/M000022.html
1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
4 | +<head> | |
5 | + <title>format_words (Mirimiri::Document)</title> | |
6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
7 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
8 | +</head> | |
9 | +<body class="standalone-code"> | |
10 | + <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 34</span> | |
11 | + <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">format_words</span> | |
12 | + <span class="ruby-identifier">wo</span> = [] | |
13 | + | |
14 | + <span class="ruby-ivar">@doc_content</span>.<span class="ruby-identifier">split</span>.<span class="ruby-identifier">each</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">w</span><span class="ruby-operator">|</span> | |
15 | + <span class="ruby-identifier">w</span>.<span class="ruby-identifier">split</span>(<span class="ruby-regexp re">/\W/</span>).<span class="ruby-identifier">each</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">sw</span><span class="ruby-operator">|</span> | |
16 | + <span class="ruby-identifier">wo</span>.<span class="ruby-identifier">push</span>(<span class="ruby-identifier">sw</span>.<span class="ruby-identifier">downcase</span>) <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">sw</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/[a-zA-Z]/</span> | |
17 | + <span class="ruby-keyword kw">end</span> | |
18 | + <span class="ruby-keyword kw">end</span> | |
19 | + | |
20 | + <span class="ruby-identifier">wo</span> | |
21 | + <span class="ruby-keyword kw">end</span></pre> | |
22 | +</body> | |
23 | +</html> |
doc/classes/Mirimiri/Document.src/M000023.html
1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
4 | +<head> | |
5 | + <title>ngrams (Mirimiri::Document)</title> | |
6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
7 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
8 | +</head> | |
9 | +<body class="standalone-code"> | |
10 | + <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 49</span> | |
11 | + <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">ngrams</span>(<span class="ruby-identifier">n</span>) | |
12 | + <span class="ruby-identifier">window</span> = [] | |
13 | + <span class="ruby-identifier">ngrams_array</span> = [] | |
14 | + | |
15 | + <span class="ruby-ivar">@words</span>.<span class="ruby-identifier">each</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">w</span><span class="ruby-operator">|</span> | |
16 | + <span class="ruby-identifier">window</span>.<span class="ruby-identifier">push</span>(<span class="ruby-identifier">w</span>) | |
17 | + <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">window</span>.<span class="ruby-identifier">size</span> <span class="ruby-operator">==</span> <span class="ruby-identifier">n</span> | |
18 | + <span class="ruby-identifier">ngrams_array</span>.<span class="ruby-identifier">push</span> <span class="ruby-identifier">window</span>.<span class="ruby-identifier">join</span>(<span class="ruby-value str">" "</span>) | |
19 | + <span class="ruby-identifier">window</span>.<span class="ruby-identifier">delete_at</span>(<span class="ruby-value">0</span>) | |
20 | + <span class="ruby-keyword kw">end</span> | |
21 | + <span class="ruby-keyword kw">end</span> | |
22 | + | |
23 | + <span class="ruby-identifier">ngrams_array</span>.<span class="ruby-identifier">uniq</span> | |
24 | + <span class="ruby-keyword kw">end</span></pre> | |
25 | +</body> | |
26 | +</html> |
doc/classes/Mirimiri/Document.src/M000024.html
1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
4 | +<head> | |
5 | + <title>count_words (Mirimiri::Document)</title> | |
6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
7 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
8 | +</head> | |
9 | +<body class="standalone-code"> | |
10 | + <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 67</span> | |
11 | + <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">count_words</span> | |
12 | + <span class="ruby-identifier">counts</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">h</span>,<span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">h</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-value">0</span> } | |
13 | + <span class="ruby-ivar">@words</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">w</span><span class="ruby-operator">|</span> <span class="ruby-identifier">counts</span>[<span class="ruby-identifier">w</span>] <span class="ruby-operator">+=</span> <span class="ruby-value">1</span> } | |
14 | + | |
15 | + <span class="ruby-identifier">counts</span> | |
16 | + <span class="ruby-keyword kw">end</span></pre> | |
17 | +</body> | |
18 | +</html> |
doc/classes/Mirimiri/Document.src/M000025.html
1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
4 | +<head> | |
5 | + <title>entropy (Mirimiri::Document)</title> | |
6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
7 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
8 | +</head> | |
9 | +<body class="standalone-code"> | |
10 | + <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 81</span> | |
11 | + <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">entropy</span>(<span class="ruby-identifier">s</span>) | |
12 | + <span class="ruby-identifier">en</span> = <span class="ruby-value">0</span><span class="ruby-value">.0</span> | |
13 | + <span class="ruby-identifier">counts</span> = <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">count_words</span> | |
14 | + | |
15 | + <span class="ruby-identifier">s</span>.<span class="ruby-identifier">split</span>.<span class="ruby-identifier">each</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">w</span><span class="ruby-operator">|</span> | |
16 | + <span class="ruby-identifier">p_wi</span> = <span class="ruby-identifier">counts</span>[<span class="ruby-identifier">w</span>].<span class="ruby-identifier">to_f</span><span class="ruby-operator">/</span><span class="ruby-ivar">@words</span>.<span class="ruby-identifier">count</span>.<span class="ruby-identifier">to_f</span> | |
17 | + <span class="ruby-identifier">en</span> <span class="ruby-operator">+=</span> <span class="ruby-identifier">p_wi</span><span class="ruby-operator">*</span><span class="ruby-constant">Math</span>.<span class="ruby-identifier">log2</span>(<span class="ruby-identifier">p_wi</span>) | |
18 | + <span class="ruby-keyword kw">end</span> | |
19 | + | |
20 | + <span class="ruby-identifier">en</span> <span class="ruby-operator">*=</span> <span class="ruby-value">-1</span> | |
21 | + <span class="ruby-identifier">en</span> | |
22 | + <span class="ruby-keyword kw">end</span></pre> | |
23 | +</body> | |
24 | +</html> |
doc/classes/Mirimiri/Document.src/M000026.html
1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
4 | +<head> | |
5 | + <title>tf (Mirimiri::Document)</title> | |
6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
7 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
8 | +</head> | |
9 | +<body class="standalone-code"> | |
10 | + <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 97</span> | |
11 | + <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">tf</span>(<span class="ruby-identifier">s</span>) | |
12 | + <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">count_words</span>[<span class="ruby-identifier">s</span>].<span class="ruby-identifier">to_f</span><span class="ruby-operator">/</span><span class="ruby-ivar">@words</span>.<span class="ruby-identifier">size</span>.<span class="ruby-identifier">to_f</span> | |
13 | + <span class="ruby-keyword kw">end</span></pre> | |
14 | +</body> | |
15 | +</html> |
doc/classes/Mirimiri/Document.src/M000027.html
1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
4 | +<head> | |
5 | + <title>new (Mirimiri::Document)</title> | |
6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
7 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
8 | +</head> | |
9 | +<body class="standalone-code"> | |
10 | + <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 102</span> | |
11 | + <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">content</span>) | |
12 | + <span class="ruby-ivar">@doc_content</span> = <span class="ruby-identifier">content</span> | |
13 | + <span class="ruby-ivar">@words</span> = <span class="ruby-identifier">format_words</span> | |
14 | + <span class="ruby-keyword kw">end</span></pre> | |
15 | +</body> | |
16 | +</html> |
doc/classes/Mirimiri/WebDocument.html
1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
4 | +<head> | |
5 | + <title>Class: Mirimiri::WebDocument [RDoc Documentation]</title> | |
6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
7 | + <meta http-equiv="Content-Script-Type" content="text/javascript" /> | |
8 | + <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" /> | |
9 | + <script type="text/javascript"> | |
10 | + // <![CDATA[ | |
11 | + | |
12 | + function popupCode( url ) { | |
13 | + window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | |
14 | + } | |
15 | + | |
16 | + function toggleCode( id ) { | |
17 | + if ( document.getElementById ) | |
18 | + elem = document.getElementById( id ); | |
19 | + else if ( document.all ) | |
20 | + elem = eval( "document.all." + id ); | |
21 | + else | |
22 | + return false; | |
23 | + | |
24 | + elemStyle = elem.style; | |
25 | + | |
26 | + if ( elemStyle.display != "block" ) { | |
27 | + elemStyle.display = "block" | |
28 | + } else { | |
29 | + elemStyle.display = "none" | |
30 | + } | |
31 | + | |
32 | + return true; | |
33 | + } | |
34 | + | |
35 | + // Make codeblocks hidden by default | |
36 | + document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | |
37 | + | |
38 | + // ]]> | |
39 | + </script> | |
40 | + | |
41 | +</head> | |
42 | +<body> | |
43 | + | |
44 | + | |
45 | + <div id="classHeader"> | |
46 | + <table class="header-table"> | |
47 | + <tr class="top-aligned-row"> | |
48 | + <td><strong>Class</strong></td> | |
49 | + <td class="class-name-in-header">Mirimiri::WebDocument</td> | |
50 | + </tr> | |
51 | + <tr class="top-aligned-row"> | |
52 | + <td><strong>In:</strong></td> | |
53 | + <td> | |
54 | + | |
55 | + | |
56 | + <a href="../../files/lib/mirimiri/document_rb.html"> | |
57 | + | |
58 | + lib/mirimiri/document.rb | |
59 | + | |
60 | + </a> | |
61 | + | |
62 | + | |
63 | + <br /> | |
64 | + | |
65 | + </td> | |
66 | + </tr> | |
67 | + | |
68 | + | |
69 | + <tr class="top-aligned-row"> | |
70 | + <td><strong>Parent:</strong></td> | |
71 | + <td> | |
72 | + | |
73 | + <a href="Document.html"> | |
74 | + | |
75 | + Mirimiri::Document | |
76 | + | |
77 | + </a> | |
78 | + | |
79 | + </td> | |
80 | + </tr> | |
81 | + | |
82 | + </table> | |
83 | + </div> | |
84 | + <!-- banner header --> | |
85 | + | |
86 | + <div id="bodyContent"> | |
87 | + | |
88 | + <div id="contextContent"> | |
89 | + | |
90 | + <div id="description"> | |
91 | + <p> | |
92 | +A <a href="WebDocument.html">WebDocument</a> is a <a | |
93 | +href="Document.html">Document</a> with a <tt>url</tt>. | |
94 | +</p> | |
95 | + | |
96 | + </div> | |
97 | + | |
98 | + </div> | |
99 | + | |
100 | + | |
101 | + <div id="method-list"> | |
102 | + <h3 class="section-bar">Methods</h3> | |
103 | + | |
104 | + <div class="name-list"> | |
105 | + | |
106 | + <a href="#M000028">get_content</a> | |
107 | + | |
108 | + <a href="#M000029">new</a> | |
109 | + | |
110 | + </div> | |
111 | + </div> | |
112 | + | |
113 | + </div> | |
114 | + | |
115 | + <!-- if includes --> | |
116 | + | |
117 | + <div id="section"> | |
118 | + | |
119 | + | |
120 | + | |
121 | + <div id="attribute-list"> | |
122 | + <h3 class="section-bar">Attributes</h3> | |
123 | + | |
124 | + <div class="name-list"> | |
125 | + <table> | |
126 | + | |
127 | + <tr class="top-aligned-row context-row"> | |
128 | + <td class="context-item-name">url</td> | |
129 | + | |
130 | + <td class="context-item-value"> [R] </td> | |
131 | + | |
132 | + <td class="context-item-desc"></td> | |
133 | + </tr> | |
134 | + | |
135 | + </table> | |
136 | + </div> | |
137 | + </div> | |
138 | + | |
139 | + | |
140 | + <!-- if method_list --> | |
141 | + | |
142 | + <div id="methods"> | |
143 | + | |
144 | + <h3 class="section-bar">Public Class methods</h3> | |
145 | + | |
146 | + | |
147 | + <div id="method-M000028" class="method-detail"> | |
148 | + <a name="M000028"></a> | |
149 | + | |
150 | + <div class="method-heading"> | |
151 | + | |
152 | + <a href="WebDocument.src/M000028.html" target="Code" class="method-signature" | |
153 | + onclick="popupCode('WebDocument.src/M000028.html');return false;"> | |
154 | + | |
155 | + <span class="method-name">get_content</span><span class="method-args">(url)</span> | |
156 | + | |
157 | + </a> | |
158 | + | |
159 | + </div> | |
160 | + | |
161 | + <div class="method-description"> | |
162 | + | |
163 | + <p> | |
164 | +Returns the HTML text from the page of a given <tt>url</tt>. | |
165 | +</p> | |
166 | + | |
167 | + </div> | |
168 | + </div> | |
169 | + | |
170 | + | |
171 | + <div id="method-M000029" class="method-detail"> | |
172 | + <a name="M000029"></a> | |
173 | + | |
174 | + <div class="method-heading"> | |
175 | + | |
176 | + <a href="WebDocument.src/M000029.html" target="Code" class="method-signature" | |
177 | + onclick="popupCode('WebDocument.src/M000029.html');return false;"> | |
178 | + | |
179 | + <span class="method-name">new</span><span class="method-args">(url)</span> | |
180 | + | |
181 | + </a> | |
182 | + | |
183 | + </div> | |
184 | + | |
185 | + <div class="method-description"> | |
186 | + | |
187 | + <p> | |
188 | +<a href="WebDocument.html">WebDocument</a> constructor. The content of the | |
189 | +<a href="Document.html">Document</a> is the HTML page with its tags removed. | |
190 | +</p> | |
191 | + | |
192 | + </div> | |
193 | + </div> | |
194 | + | |
195 | + | |
196 | + | |
197 | + </div> | |
198 | + | |
199 | + | |
200 | + | |
201 | + | |
202 | + </div> | |
203 | + | |
204 | +<div id="validator-badges"> | |
205 | + <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | |
206 | +</div> | |
207 | + | |
208 | +</body> | |
209 | +</html> |
doc/classes/Mirimiri/WebDocument.src/M000028.html
1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
4 | +<head> | |
5 | + <title>get_content (Mirimiri::WebDocument)</title> | |
6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
7 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
8 | +</head> | |
9 | +<body class="standalone-code"> | |
10 | + <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 115</span> | |
11 | + <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">get_content</span>(<span class="ruby-identifier">url</span>) | |
12 | + <span class="ruby-identifier">require</span> <span class="ruby-value str">'net/http'</span> | |
13 | + <span class="ruby-constant">Net</span><span class="ruby-operator">::</span><span class="ruby-constant">HTTP</span>.<span class="ruby-identifier">get</span>(<span class="ruby-constant">URI</span>.<span class="ruby-identifier">parse</span>(<span class="ruby-identifier">url</span>)) | |
14 | + <span class="ruby-keyword kw">end</span></pre> | |
15 | +</body> | |
16 | +</html> |
doc/classes/Mirimiri/WebDocument.src/M000029.html
1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
4 | +<head> | |
5 | + <title>new (Mirimiri::WebDocument)</title> | |
6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
7 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
8 | +</head> | |
9 | +<body class="standalone-code"> | |
10 | + <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 122</span> | |
11 | + <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">url</span>) | |
12 | + <span class="ruby-ivar">@url</span> = <span class="ruby-identifier">url</span> | |
13 | + <span class="ruby-keyword kw">super</span> <span class="ruby-constant">WebDocument</span>.<span class="ruby-identifier">get_content</span>(<span class="ruby-identifier">url</span>).<span class="ruby-identifier">strip_javascripts</span>.<span class="ruby-identifier">strip_stylesheets</span>.<span class="ruby-identifier">strip_xml_tags</span> | |
14 | + <span class="ruby-keyword kw">end</span></pre> | |
15 | +</body> | |
16 | +</html> |
doc/classes/Mirimiri/WikipediaPage.html
1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
4 | +<head> | |
5 | + <title>Class: Mirimiri::WikipediaPage [RDoc Documentation]</title> | |
6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
7 | + <meta http-equiv="Content-Script-Type" content="text/javascript" /> | |
8 | + <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" /> | |
9 | + <script type="text/javascript"> | |
10 | + // <![CDATA[ | |
11 | + | |
12 | + function popupCode( url ) { | |
13 | + window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | |
14 | + } | |
15 | + | |
16 | + function toggleCode( id ) { | |
17 | + if ( document.getElementById ) | |
18 | + elem = document.getElementById( id ); | |
19 | + else if ( document.all ) | |
20 | + elem = eval( "document.all." + id ); | |
21 | + else | |
22 | + return false; | |
23 | + | |
24 | + elemStyle = elem.style; | |
25 | + | |
26 | + if ( elemStyle.display != "block" ) { | |
27 | + elemStyle.display = "block" | |
28 | + } else { | |
29 | + elemStyle.display = "none" | |
30 | + } | |
31 | + | |
32 | + return true; | |
33 | + } | |
34 | + | |
35 | + // Make codeblocks hidden by default | |
36 | + document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | |
37 | + | |
38 | + // ]]> | |
39 | + </script> | |
40 | + | |
41 | +</head> | |
42 | +<body> | |
43 | + | |
44 | + | |
45 | + <div id="classHeader"> | |
46 | + <table class="header-table"> | |
47 | + <tr class="top-aligned-row"> | |
48 | + <td><strong>Class</strong></td> | |
49 | + <td class="class-name-in-header">Mirimiri::WikipediaPage</td> | |
50 | + </tr> | |
51 | + <tr class="top-aligned-row"> | |
52 | + <td><strong>In:</strong></td> | |
53 | + <td> | |
54 | + | |
55 | + | |
56 | + <a href="../../files/lib/mirimiri/document_rb.html"> | |
57 | + | |
58 | + lib/mirimiri/document.rb | |
59 | + | |
60 | + </a> | |
61 | + | |
62 | + | |
63 | + <br /> | |
64 | + | |
65 | + </td> | |
66 | + </tr> | |
67 | + | |
68 | + | |
69 | + <tr class="top-aligned-row"> | |
70 | + <td><strong>Parent:</strong></td> | |
71 | + <td> | |
72 | + | |
73 | + <a href="WebDocument.html"> | |
74 | + | |
75 | + Mirimiri::WebDocument | |
76 | + | |
77 | + </a> | |
78 | + | |
79 | + </td> | |
80 | + </tr> | |
81 | + | |
82 | + </table> | |
83 | + </div> | |
84 | + <!-- banner header --> | |
85 | + | |
86 | + <div id="bodyContent"> | |
87 | + | |
88 | + <div id="contextContent"> | |
89 | + | |
90 | + <div id="description"> | |
91 | + <p> | |
92 | +A <a href="WikipediaPage.html">WikipediaPage</a> is a <a | |
93 | +href="WebDocument.html">WebDocument</a>. | |
94 | +</p> | |
95 | + | |
96 | + </div> | |
97 | + | |
98 | + </div> | |
99 | + | |
100 | + | |
101 | + <div id="method-list"> | |
102 | + <h3 class="section-bar">Methods</h3> | |
103 | + | |
104 | + <div class="name-list"> | |
105 | + | |
106 | + <a href="#M000031">get_url</a> | |
107 | + | |
108 | + <a href="#M000032">search_homepage</a> | |
109 | + | |
110 | + <a href="#M000030">search_wikipedia_titles</a> | |
111 | + | |
112 | + </div> | |
113 | + </div> | |
114 | + | |
115 | + </div> | |
116 | + | |
117 | + <!-- if includes --> | |
118 | + | |
119 | + <div id="section"> | |
120 | + | |
121 | + | |
122 | + | |
123 | + | |
124 | + <!-- if method_list --> | |
125 | + | |
126 | + <div id="methods"> | |
127 | + | |
128 | + <h3 class="section-bar">Public Class methods</h3> | |
129 | + | |
130 | + | |
131 | + <div id="method-M000031" class="method-detail"> | |
132 | + <a name="M000031"></a> | |
133 | + | |
134 | + <div class="method-heading"> | |
135 | + | |
136 | + <a href="WikipediaPage.src/M000031.html" target="Code" class="method-signature" | |
137 | + onclick="popupCode('WikipediaPage.src/M000031.html');return false;"> | |
138 | + | |
139 | + <span class="method-name">get_url</span><span class="method-args">(name)</span> | |
140 | + | |
141 | + </a> | |
142 | + | |
143 | + </div> | |
144 | + | |
145 | + <div class="method-description"> | |
146 | + | |
147 | + </div> | |
148 | + </div> | |
149 | + | |
150 | + | |
151 | + <div id="method-M000032" class="method-detail"> | |
152 | + <a name="M000032"></a> | |
153 | + | |
154 | + <div class="method-heading"> | |
155 | + | |
156 | + <a href="WikipediaPage.src/M000032.html" target="Code" class="method-signature" | |
157 | + onclick="popupCode('WikipediaPage.src/M000032.html');return false;"> | |
158 | + | |
159 | + <span class="method-name">search_homepage</span><span class="method-args">(name)</span> | |
160 | + | |
161 | + </a> | |
162 | + | |
163 | + </div> | |
164 | + | |
165 | + <div class="method-description"> | |
166 | + | |
167 | + </div> | |
168 | + </div> | |
169 | + | |
170 | + | |
171 | + <div id="method-M000030" class="method-detail"> | |
172 | + <a name="M000030"></a> | |
173 | + | |
174 | + <div class="method-heading"> | |
175 | + | |
176 | + <a href="WikipediaPage.src/M000030.html" target="Code" class="method-signature" | |
177 | + onclick="popupCode('WikipediaPage.src/M000030.html');return false;"> | |
178 | + | |
179 | + <span class="method-name">search_wikipedia_titles</span><span class="method-args">(name)</span> | |
180 | + | |
181 | + </a> | |
182 | + | |
183 | + </div> | |
184 | + | |
185 | + <div class="method-description"> | |
186 | + | |
187 | + </div> | |
188 | + </div> | |
189 | + | |
190 | + | |
191 | + | |
192 | + </div> | |
193 | + | |
194 | + | |
195 | + | |
196 | + | |
197 | + </div> | |
198 | + | |
199 | +<div id="validator-badges"> | |
200 | + <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | |
201 | +</div> | |
202 | + | |
203 | +</body> | |
204 | +</html> |
doc/classes/Mirimiri/WikipediaPage.src/M000030.html
1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
4 | +<head> | |
5 | + <title>search_wikipedia_titles (Mirimiri::WikipediaPage)</title> | |
6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
7 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
8 | +</head> | |
9 | +<body class="standalone-code"> | |
10 | + <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 135</span> | |
11 | + <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">search_wikipedia_titles</span>(<span class="ruby-identifier">name</span>) | |
12 | + <span class="ruby-identifier">raise</span> <span class="ruby-constant">ArgumentError</span>, <span class="ruby-value str">"Bad encoding"</span>, <span class="ruby-identifier">name</span> <span class="ruby-keyword kw">unless</span> <span class="ruby-identifier">name</span>.<span class="ruby-identifier">isutf8</span> | |
13 | + | |
14 | + <span class="ruby-identifier">res</span> = <span class="ruby-constant">REXML</span><span class="ruby-operator">::</span><span class="ruby-constant">Document</span>.<span class="ruby-identifier">new</span>(<span class="ruby-constant">Net</span><span class="ruby-operator">::</span><span class="ruby-constant">HTTP</span>.<span class="ruby-identifier">get</span>( <span class="ruby-constant">URI</span>.<span class="ruby-identifier">parse</span> <span class="ruby-node">"http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml"</span> ).<span class="ruby-identifier">toutf8</span>).<span class="ruby-identifier">elements</span>[<span class="ruby-value str">'api/query/search'</span>] | |
15 | + | |
16 | + <span class="ruby-identifier">res</span>.<span class="ruby-identifier">collect</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">e</span><span class="ruby-operator">|</span> <span class="ruby-identifier">e</span>.<span class="ruby-identifier">attributes</span>[<span class="ruby-value str">'title'</span>] } <span class="ruby-keyword kw">unless</span> <span class="ruby-identifier">res</span>.<span class="ruby-identifier">nil?</span> | |
17 | + <span class="ruby-keyword kw">end</span></pre> | |
18 | +</body> | |
19 | +</html> |
doc/classes/Mirimiri/WikipediaPage.src/M000031.html
1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
4 | +<head> | |
5 | + <title>get_url (Mirimiri::WikipediaPage)</title> | |
6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
7 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
8 | +</head> | |
9 | +<body class="standalone-code"> | |
10 | + <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 143</span> | |
11 | + <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">get_url</span>(<span class="ruby-identifier">name</span>) | |
12 | + <span class="ruby-identifier">raise</span> <span class="ruby-constant">ArgumentError</span>, <span class="ruby-value str">"Bad encoding"</span>, <span class="ruby-identifier">name</span> <span class="ruby-keyword kw">unless</span> <span class="ruby-identifier">name</span>.<span class="ruby-identifier">isutf8</span> | |
13 | + | |
14 | + <span class="ruby-identifier">atts</span> = <span class="ruby-constant">REXML</span><span class="ruby-operator">::</span><span class="ruby-constant">Document</span>.<span class="ruby-identifier">new</span>(<span class="ruby-constant">Net</span><span class="ruby-operator">::</span><span class="ruby-constant">HTTP</span>.<span class="ruby-identifier">get</span>( <span class="ruby-constant">URI</span>.<span class="ruby-identifier">parse</span> <span class="ruby-node">"http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml"</span> ).<span class="ruby-identifier">toutf8</span>).<span class="ruby-identifier">elements</span>[<span class="ruby-value str">'api/query/pages/page'</span>].<span class="ruby-identifier">attributes</span> | |
15 | + | |
16 | + <span class="ruby-identifier">atts</span>[<span class="ruby-value str">'fullurl'</span>] <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">atts</span>[<span class="ruby-value str">'missing'</span>].<span class="ruby-identifier">nil?</span> | |
17 | + <span class="ruby-keyword kw">end</span></pre> | |
18 | +</body> | |
19 | +</html> |
doc/classes/Mirimiri/WikipediaPage.src/M000032.html
1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
4 | +<head> | |
5 | + <title>search_homepage (Mirimiri::WikipediaPage)</title> | |
6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
7 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
8 | +</head> | |
9 | +<body class="standalone-code"> | |
10 | + <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 151</span> | |
11 | + <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">search_homepage</span>(<span class="ruby-identifier">name</span>) | |
12 | + <span class="ruby-identifier">title</span> = <span class="ruby-constant">WikipediaPage</span>.<span class="ruby-identifier">search_wikipedia_titles</span> <span class="ruby-identifier">name</span> | |
13 | + | |
14 | + <span class="ruby-constant">WikipediaPage</span>.<span class="ruby-identifier">new</span>(<span class="ruby-constant">WikipediaPage</span>.<span class="ruby-identifier">get_url</span> <span class="ruby-identifier">title</span>[<span class="ruby-value">0</span>]) <span class="ruby-keyword kw">unless</span> <span class="ruby-identifier">title</span>.<span class="ruby-identifier">nil?</span> <span class="ruby-operator">||</span> <span class="ruby-identifier">title</span>.<span class="ruby-identifier">empty?</span> | |
15 | + <span class="ruby-keyword kw">end</span></pre> | |
16 | +</body> | |
17 | +</html> |
doc/files/lib/mirimiri/corpus_rb.html
1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
4 | +<head> | |
5 | + <title>File: corpus.rb [RDoc Documentation]</title> | |
6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
7 | + <meta http-equiv="Content-Script-Type" content="text/javascript" /> | |
8 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
9 | + <script type="text/javascript"> | |
10 | + // <![CDATA[ | |
11 | + | |
12 | + function popupCode( url ) { | |
13 | + window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | |
14 | + } | |
15 | + | |
16 | + function toggleCode( id ) { | |
17 | + if ( document.getElementById ) | |
18 | + elem = document.getElementById( id ); | |
19 | + else if ( document.all ) | |
20 | + elem = eval( "document.all." + id ); | |
21 | + else | |
22 | + return false; | |
23 | + | |
24 | + elemStyle = elem.style; | |
25 | + | |
26 | + if ( elemStyle.display != "block" ) { | |
27 | + elemStyle.display = "block" | |
28 | + } else { | |
29 | + elemStyle.display = "none" | |
30 | + } | |
31 | + | |
32 | + return true; | |
33 | + } | |
34 | + | |
35 | + // Make codeblocks hidden by default | |
36 | + document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | |
37 | + | |
38 | + // ]]> | |
39 | + </script> | |
40 | + | |
41 | +</head> | |
42 | +<body> | |
43 | + | |
44 | + | |
45 | + <div id="fileHeader"> | |
46 | + <h1>corpus.rb</h1> | |
47 | + <table class="header-table"> | |
48 | + <tr class="top-aligned-row"> | |
49 | + <td><strong>Path:</strong></td> | |
50 | + <td>lib/mirimiri/corpus.rb | |
51 | + | |
52 | + </td> | |
53 | + </tr> | |
54 | + <tr class="top-aligned-row"> | |
55 | + <td><strong>Last Update:</strong></td> | |
56 | + <td>2010-12-20 10:35:26 +0100</td> | |
57 | + </tr> | |
58 | + </table> | |
59 | + </div> | |
60 | + <!-- banner header --> | |
61 | + | |
62 | + <div id="bodyContent"> | |
63 | + | |
64 | + <div id="contextContent"> | |
65 | + | |
66 | + </div> | |
67 | + | |
68 | + | |
69 | + </div> | |
70 | + | |
71 | + <!-- if includes --> | |
72 | + | |
73 | + <div id="section"> | |
74 | + | |
75 | + | |
76 | + | |
77 | + | |
78 | + <!-- if method_list --> | |
79 | + | |
80 | + | |
81 | + | |
82 | + | |
83 | + </div> | |
84 | + | |
85 | +<div id="validator-badges"> | |
86 | + <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | |
87 | +</div> | |
88 | + | |
89 | +</body> | |
90 | +</html> |
doc/files/lib/mirimiri/document_rb.html
1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
4 | +<head> | |
5 | + <title>File: document.rb [RDoc Documentation]</title> | |
6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
7 | + <meta http-equiv="Content-Script-Type" content="text/javascript" /> | |
8 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
9 | + <script type="text/javascript"> | |
10 | + // <![CDATA[ | |
11 | + | |
12 | + function popupCode( url ) { | |
13 | + window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | |
14 | + } | |
15 | + | |
16 | + function toggleCode( id ) { | |
17 | + if ( document.getElementById ) | |
18 | + elem = document.getElementById( id ); | |
19 | + else if ( document.all ) | |
20 | + elem = eval( "document.all." + id ); | |
21 | + else | |
22 | + return false; | |
23 | + | |
24 | + elemStyle = elem.style; | |
25 | + | |
26 | + if ( elemStyle.display != "block" ) { | |
27 | + elemStyle.display = "block" | |
28 | + } else { | |
29 | + elemStyle.display = "none" | |
30 | + } | |
31 | + | |
32 | + return true; | |
33 | + } | |
34 | + | |
35 | + // Make codeblocks hidden by default | |
36 | + document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | |
37 | + | |
38 | + // ]]> | |
39 | + </script> | |
40 | + | |
41 | +</head> | |
42 | +<body> | |
43 | + | |
44 | + | |
45 | + <div id="fileHeader"> | |
46 | + <h1>document.rb</h1> | |
47 | + <table class="header-table"> | |
48 | + <tr class="top-aligned-row"> | |
49 | + <td><strong>Path:</strong></td> | |
50 | + <td>lib/mirimiri/document.rb | |
51 | + | |
52 | + </td> | |
53 | + </tr> | |
54 | + <tr class="top-aligned-row"> | |
55 | + <td><strong>Last Update:</strong></td> | |
56 | + <td>2010-12-20 10:36:07 +0100</td> | |
57 | + </tr> | |
58 | + </table> | |
59 | + </div> | |
60 | + <!-- banner header --> | |
61 | + | |
62 | + <div id="bodyContent"> | |
63 | + | |
64 | + <div id="contextContent"> | |
65 | + | |
66 | + <div id="requires-list"> | |
67 | + <h3 class="section-bar">Required files</h3> | |
68 | + | |
69 | + <div class="name-list"> | |
70 | + | |
71 | + net/http | |
72 | + | |
73 | + rexml/document | |
74 | + | |
75 | + net/http | |
76 | + | |
77 | + kconv | |
78 | + | |
79 | + </div> | |
80 | + </div> | |
81 | + | |
82 | + </div> | |
83 | + | |
84 | + | |
85 | + </div> | |
86 | + | |
87 | + <!-- if includes --> | |
88 | + | |
89 | + <div id="section"> | |
90 | + | |
91 | + | |
92 | + | |
93 | + | |
94 | + <!-- if method_list --> | |
95 | + | |
96 | + | |
97 | + | |
98 | + | |
99 | + </div> | |
100 | + | |
101 | +<div id="validator-badges"> | |
102 | + <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | |
103 | +</div> | |
104 | + | |
105 | +</body> | |
106 | +</html> |
doc/files/lib/mirimiri/query_rb.html
1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
4 | +<head> | |
5 | + <title>File: query.rb [RDoc Documentation]</title> | |
6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
7 | + <meta http-equiv="Content-Script-Type" content="text/javascript" /> | |
8 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
9 | + <script type="text/javascript"> | |
10 | + // <![CDATA[ | |
11 | + | |
12 | + function popupCode( url ) { | |
13 | + window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | |
14 | + } | |
15 | + | |
16 | + function toggleCode( id ) { | |
17 | + if ( document.getElementById ) | |
18 | + elem = document.getElementById( id ); | |
19 | + else if ( document.all ) | |
20 | + elem = eval( "document.all." + id ); | |
21 | + else | |
22 | + return false; | |
23 | + | |
24 | + elemStyle = elem.style; | |
25 | + | |
26 | + if ( elemStyle.display != "block" ) { | |
27 | + elemStyle.display = "block" | |
28 | + } else { | |
29 | + elemStyle.display = "none" | |
30 | + } | |
31 | + | |
32 | + return true; | |
33 | + } | |
34 | + | |
35 | + // Make codeblocks hidden by default | |
36 | + document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | |
37 | + | |
38 | + // ]]> | |
39 | + </script> | |
40 | + | |
41 | +</head> | |
42 | +<body> | |
43 | + | |
44 | + | |
45 | + <div id="fileHeader"> | |
46 | + <h1>query.rb</h1> | |
47 | + <table class="header-table"> | |
48 | + <tr class="top-aligned-row"> | |
49 | + <td><strong>Path:</strong></td> | |
50 | + <td>lib/mirimiri/query.rb | |
51 | + | |
52 | + </td> | |
53 | + </tr> | |
54 | + <tr class="top-aligned-row"> | |
55 | + <td><strong>Last Update:</strong></td> | |
56 | + <td>2010-12-20 10:36:27 +0100</td> | |
57 | + </tr> | |
58 | + </table> | |
59 | + </div> | |
60 | + <!-- banner header --> | |
61 | + | |
62 | + <div id="bodyContent"> | |
63 | + | |
64 | + <div id="contextContent"> | |
65 | + | |
66 | + </div> | |
67 | + | |
68 | + | |
69 | + </div> | |
70 | + | |
71 | + <!-- if includes --> | |
72 | + | |
73 | + <div id="section"> | |
74 | + | |
75 | + | |
76 | + | |
77 | + | |
78 | + <!-- if method_list --> | |
79 | + | |
80 | + | |
81 | + | |
82 | + | |
83 | + </div> | |
84 | + | |
85 | +<div id="validator-badges"> | |
86 | + <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | |
87 | +</div> | |
88 | + | |
89 | +</body> | |
90 | +</html> |
doc/files/lib/mirimiri/regexp_rb.html
1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
4 | +<head> | |
5 | + <title>File: regexp.rb [RDoc Documentation]</title> | |
6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
7 | + <meta http-equiv="Content-Script-Type" content="text/javascript" /> | |
8 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
9 | + <script type="text/javascript"> | |
10 | + // <![CDATA[ | |
11 | + | |
12 | + function popupCode( url ) { | |
13 | + window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | |
14 | + } | |
15 | + | |
16 | + function toggleCode( id ) { | |
17 | + if ( document.getElementById ) | |
18 | + elem = document.getElementById( id ); | |
19 | + else if ( document.all ) | |
20 | + elem = eval( "document.all." + id ); | |
21 | + else | |
22 | + return false; | |
23 | + | |
24 | + elemStyle = elem.style; | |
25 | + | |
26 | + if ( elemStyle.display != "block" ) { | |
27 | + elemStyle.display = "block" | |
28 | + } else { | |
29 | + elemStyle.display = "none" | |
30 | + } | |
31 | + | |
32 | + return true; | |
33 | + } | |
34 | + | |
35 | + // Make codeblocks hidden by default | |
36 | + document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | |
37 | + | |
38 | + // ]]> | |
39 | + </script> | |
40 | + | |
41 | +</head> | |
42 | +<body> | |
43 | + | |
44 | + | |
45 | + <div id="fileHeader"> | |
46 | + <h1>regexp.rb</h1> | |
47 | + <table class="header-table"> | |
48 | + <tr class="top-aligned-row"> | |
49 | + <td><strong>Path:</strong></td> | |
50 | + <td>lib/mirimiri/regexp.rb | |
51 | + | |
52 | + </td> | |
53 | + </tr> | |
54 | + <tr class="top-aligned-row"> | |
55 | + <td><strong>Last Update:</strong></td> | |
56 | + <td>2010-12-20 10:36:42 +0100</td> | |
57 | + </tr> | |
58 | + </table> | |
59 | + </div> | |
60 | + <!-- banner header --> | |
61 | + | |
62 | + <div id="bodyContent"> | |
63 | + | |
64 | + <div id="contextContent"> | |
65 | + | |
66 | + </div> | |
67 | + | |
68 | + | |
69 | + </div> | |
70 | + | |
71 | + <!-- if includes --> | |
72 | + | |
73 | + <div id="section"> | |
74 | + | |
75 | + | |
76 | + | |
77 | + | |
78 | + <!-- if method_list --> | |
79 | + | |
80 | + | |
81 | + | |
82 | + | |
83 | + </div> | |
84 | + | |
85 | +<div id="validator-badges"> | |
86 | + <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | |
87 | +</div> | |
88 | + | |
89 | +</body> | |
90 | +</html> |
doc/files/lib/mirimiri/string_rb.html
1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
4 | +<head> | |
5 | + <title>File: string.rb [RDoc Documentation]</title> | |
6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
7 | + <meta http-equiv="Content-Script-Type" content="text/javascript" /> | |
8 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
9 | + <script type="text/javascript"> | |
10 | + // <![CDATA[ | |
11 | + | |
12 | + function popupCode( url ) { | |
13 | + window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | |
14 | + } | |
15 | + | |
16 | + function toggleCode( id ) { | |
17 | + if ( document.getElementById ) | |
18 | + elem = document.getElementById( id ); | |
19 | + else if ( document.all ) | |
20 | + elem = eval( "document.all." + id ); | |
21 | + else | |
22 | + return false; | |
23 | + | |
24 | + elemStyle = elem.style; | |
25 | + | |
26 | + if ( elemStyle.display != "block" ) { | |
27 | + elemStyle.display = "block" | |
28 | + } else { | |
29 | + elemStyle.display = "none" | |
30 | + } | |
31 | + | |
32 | + return true; | |
33 | + } | |
34 | + | |
35 | + // Make codeblocks hidden by default | |
36 | + document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | |
37 | + | |
38 | + // ]]> | |
39 | + </script> | |
40 | + | |
41 | +</head> | |
42 | +<body> | |
43 | + | |
44 | + | |
45 | + <div id="fileHeader"> | |
46 | + <h1>string.rb</h1> | |
47 | + <table class="header-table"> | |
48 | + <tr class="top-aligned-row"> | |
49 | + <td><strong>Path:</strong></td> | |
50 | + <td>lib/mirimiri/string.rb | |
51 | + | |
52 | + </td> | |
53 | + </tr> | |
54 | + <tr class="top-aligned-row"> | |
55 | + <td><strong>Last Update:</strong></td> | |
56 | + <td>2010-12-20 10:37:16 +0100</td> | |
57 | + </tr> | |
58 | + </table> | |
59 | + </div> | |
60 | + <!-- banner header --> | |
61 | + | |
62 | + <div id="bodyContent"> | |
63 | + | |
64 | + <div id="contextContent"> | |
65 | + | |
66 | + <div id="description"> | |
67 | + <hr size="1"></hr><p> | |
68 | +General module | |
69 | +</p> | |
70 | + | |
71 | + </div> | |
72 | + | |
73 | + <div id="requires-list"> | |
74 | + <h3 class="section-bar">Required files</h3> | |
75 | + | |
76 | + <div class="name-list"> | |
77 | + | |
78 | + cgi | |
79 | + | |
80 | + kconv | |
81 | + | |
82 | + </div> | |
83 | + </div> | |
84 | + | |
85 | + </div> | |
86 | + | |
87 | + | |
88 | + </div> | |
89 | + | |
90 | + <!-- if includes --> | |
91 | + | |
92 | + <div id="section"> | |
93 | + | |
94 | + | |
95 | + | |
96 | + | |
97 | + <!-- if method_list --> | |
98 | + | |
99 | + | |
100 | + | |
101 | + | |
102 | + </div> | |
103 | + | |
104 | +<div id="validator-badges"> | |
105 | + <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | |
106 | +</div> | |
107 | + | |
108 | +</body> | |
109 | +</html> |
doc/files/lib/mirimiri/ttagger_rb.html
1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
4 | +<head> | |
5 | + <title>File: ttagger.rb [RDoc Documentation]</title> | |
6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
7 | + <meta http-equiv="Content-Script-Type" content="text/javascript" /> | |
8 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
9 | + <script type="text/javascript"> | |
10 | + // <![CDATA[ | |
11 | + | |
12 | + function popupCode( url ) { | |
13 | + window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | |
14 | + } | |
15 | + | |
16 | + function toggleCode( id ) { | |
17 | + if ( document.getElementById ) | |
18 | + elem = document.getElementById( id ); | |
19 | + else if ( document.all ) | |
20 | + elem = eval( "document.all." + id ); | |
21 | + else | |
22 | + return false; | |
23 | + | |
24 | + elemStyle = elem.style; | |
25 | + | |
26 | + if ( elemStyle.display != "block" ) { | |
27 | + elemStyle.display = "block" | |
28 | + } else { | |
29 | + elemStyle.display = "none" | |
30 | + } | |
31 | + | |
32 | + return true; | |
33 | + } | |
34 | + | |
35 | + // Make codeblocks hidden by default | |
36 | + document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | |
37 | + | |
38 | + // ]]> | |
39 | + </script> | |
40 | + | |
41 | +</head> | |
42 | +<body> | |
43 | + | |
44 | + | |
45 | + <div id="fileHeader"> | |
46 | + <h1>ttagger.rb</h1> | |
47 | + <table class="header-table"> | |
48 | + <tr class="top-aligned-row"> | |
49 | + <td><strong>Path:</strong></td> | |
50 | + <td>lib/mirimiri/ttagger.rb | |
51 | + | |
52 | + </td> | |
53 | + </tr> | |
54 | + <tr class="top-aligned-row"> | |
55 | + <td><strong>Last Update:</strong></td> | |
56 | + <td>2010-12-20 10:37:32 +0100</td> | |
57 | + </tr> | |
58 | + </table> | |
59 | + </div> | |
60 | + <!-- banner header --> | |
61 | + | |
62 | + <div id="bodyContent"> | |
63 | + | |
64 | + <div id="contextContent"> | |
65 | + | |
66 | + </div> | |
67 | + | |
68 | + | |
69 | + </div> | |
70 | + | |
71 | + <!-- if includes --> | |
72 | + | |
73 | + <div id="section"> | |
74 | + | |
75 | + | |
76 | + | |
77 | + | |
78 | + <!-- if method_list --> | |
79 | + | |
80 | + | |
81 | + | |
82 | + | |
83 | + </div> | |
84 | + | |
85 | +<div id="validator-badges"> | |
86 | + <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | |
87 | +</div> | |
88 | + | |
89 | +</body> | |
90 | +</html> |
doc/files/lib/mirimiri_rb.html
1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
4 | +<head> | |
5 | + <title>File: mirimiri.rb [RDoc Documentation]</title> | |
6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
7 | + <meta http-equiv="Content-Script-Type" content="text/javascript" /> | |
8 | + <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" /> | |
9 | + <script type="text/javascript"> | |
10 | + // <![CDATA[ | |
11 | + | |
12 | + function popupCode( url ) { | |
13 | + window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | |
14 | + } | |
15 | + | |
16 | + function toggleCode( id ) { | |
17 | + if ( document.getElementById ) | |
18 | + elem = document.getElementById( id ); | |
19 | + else if ( document.all ) | |
20 | + elem = eval( "document.all." + id ); | |
21 | + else | |
22 | + return false; | |
23 | + | |
24 | + elemStyle = elem.style; | |
25 | + | |
26 | + if ( elemStyle.display != "block" ) { | |
27 | + elemStyle.display = "block" | |
28 | + } else { | |
29 | + elemStyle.display = "none" | |
30 | + } | |
31 | + | |
32 | + return true; | |
33 | + } | |
34 | + | |
35 | + // Make codeblocks hidden by default | |
36 | + document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | |
37 | + | |
38 | + // ]]> | |
39 | + </script> | |
40 | + | |
41 | +</head> | |
42 | +<body> | |
43 | + | |
44 | + | |
45 | + <div id="fileHeader"> | |
46 | + <h1>mirimiri.rb</h1> | |
47 | + <table class="header-table"> | |
48 | + <tr class="top-aligned-row"> | |
49 | + <td><strong>Path:</strong></td> | |
50 | + <td>lib/mirimiri.rb | |
51 | + | |
52 | + </td> | |
53 | + </tr> | |
54 | + <tr class="top-aligned-row"> | |
55 | + <td><strong>Last Update:</strong></td> | |
56 | + <td>2010-12-20 10:33:51 +0100</td> | |
57 | + </tr> | |
58 | + </table> | |
59 | + </div> | |
60 | + <!-- banner header --> | |
61 | + | |
62 | + <div id="bodyContent"> | |
63 | + | |
64 | + <div id="contextContent"> | |
65 | + | |
66 | + <div id="requires-list"> | |
67 | + <h3 class="section-bar">Required files</h3> | |
68 | + | |
69 | + <div class="name-list"> | |
70 | + | |
71 | + rir/document | |
72 | + | |
73 | + rir/string | |
74 | + | |
75 | + rir/query | |
76 | + | |
77 | + rir/corpus | |
78 | + | |
79 | + rir/regexp | |
80 | + | |
81 | + rir/ttagger | |
82 | + | |
83 | + </div> | |
84 | + </div> | |
85 | + | |
86 | + </div> | |
87 | + | |
88 | + | |
89 | + </div> | |
90 | + | |
91 | + <!-- if includes --> | |
92 | + | |
93 | + <div id="section"> | |
94 | + | |
95 | + | |
96 | + | |
97 | + | |
98 | + <!-- if method_list --> | |
99 | + | |
100 | + | |
101 | + | |
102 | + | |
103 | + </div> | |
104 | + | |
105 | +<div id="validator-badges"> | |
106 | + <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | |
107 | +</div> | |
108 | + | |
109 | +</body> | |
110 | +</html> |
lib/mirimiri.rb
lib/mirimiri/corpus.rb
1 | +#!/usr/bin/env ruby | |
2 | + | |
3 | +#-- | |
4 | +# This file is a part of the mirimiri library | |
5 | +# | |
6 | +# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | |
7 | +# | |
8 | +# This program is free software: you can redistribute it and/or modify | |
9 | +# it under the terms of the GNU General Public License as published by | |
10 | +# the Free Software Foundation, either version 3 of the License, or | |
11 | +# (at your option) any later version. | |
12 | +# | |
13 | +# This program is distributed in the hope that it will be useful, | |
14 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | +# GNU General Public License for more details. | |
17 | +# | |
18 | +# You should have received a copy of the GNU General Public License | |
19 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | +#++ | |
21 | + | |
# A Corpus is a directory tree of files rooted at +path+.
class Corpus
  attr_accessor :path

  # Creates a Corpus rooted at +path+. One trailing "/" is removed so that
  # sub-paths can be built uniformly from +path+.
  def initialize(path)
    @path = path.chomp("/")
  end

  # Recursively lists all regular files found under +self.path+.
  # Directories are never returned, and files without an extension
  # (e.g. "Makefile") are included. Entries whose name starts with a dot
  # are skipped, as usual with Dir.glob.
  #
  # WARNING ! This function may take a lot of time if many
  # files are in subdirectories.
  #
  #   c = Corpus.new "my/path"
  #   c.files # => ["my/path/README.txt", "my/path/lib/code.rb"]
  def files
    # "**/*" also matches directories, hence the explicit File.file? filter.
    Dir.glob(File.join(@path, "**", "*")).select { |entry| File.file?(entry) }
  end
end
lib/mirimiri/document.rb
1 | +#!/usr/bin/env ruby | |
2 | + | |
3 | +#-- | |
4 | +# This file is a part of the mirimiri library | |
5 | +# | |
6 | +# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | |
7 | +# | |
8 | +# This program is free software: you can redistribute it and/or modify | |
9 | +# it under the terms of the GNU General Public License as published by | |
10 | +# the Free Software Foundation, either version 3 of the License, or | |
11 | +# (at your option) any later version. | |
12 | +# | |
13 | +# This program is distributed in the hope that it will be useful, | |
14 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | +# GNU General Public License for more details. | |
17 | +# | |
18 | +# You should have received a copy of the GNU General Public License | |
19 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | +#++ | |
21 | + | |
22 | + | |
23 | +# General module | |
24 | +module Mirimiri | |
25 | + | |
26 | + # A Document is a bag of words and is constructed from a string. | |
27 | + class Document | |
28 | + attr_reader :words, :doc_content | |
29 | + | |
30 | + # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html | |
31 | + # and the \\W special escape). | |
32 | + # | |
33 | + # Protected function, only meant to by called at the initialization. | |
34 | + def format_words | |
35 | + wo = [] | |
36 | + | |
37 | + @doc_content.split.each do |w| | |
38 | + w.split(/\W/).each do |sw| | |
39 | + wo.push(sw.downcase) if sw =~ /[a-zA-Z]/ | |
40 | + end | |
41 | + end | |
42 | + | |
43 | + wo | |
44 | + end | |
45 | + | |
46 | + # Returns an Array containing the +n+-grams (words) from the current Document. | |
47 | + # | |
48 | + # ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...] | |
49 | + def ngrams(n) | |
50 | + window = [] | |
51 | + ngrams_array = [] | |
52 | + | |
53 | + @words.each do |w| | |
54 | + window.push(w) | |
55 | + if window.size == n | |
56 | + ngrams_array.push window.join(" ") | |
57 | + window.delete_at(0) | |
58 | + end | |
59 | + end | |
60 | + | |
61 | + ngrams_array.uniq | |
62 | + end | |
63 | + | |
64 | + # Returns a Hash containing the words and their associated counts in the current Document. | |
65 | + # | |
66 | + # count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... } | |
67 | + def count_words | |
68 | + counts = Hash.new { |h,k| h[k] = 0 } | |
69 | + @words.each { |w| counts[w] += 1 } | |
70 | + | |
71 | + counts | |
72 | + end | |
73 | + | |
74 | + # Computes the entropy of a given string +s+ inside the document. | |
75 | + # | |
76 | + # If the string parameter is composed of many words (i.e. tokens separated | |
77 | + # by whitespace(s)), it is considered as an ngram. | |
78 | + # | |
79 | + # entropy("guitar") #=> 0.00432114812727959 | |
80 | + # entropy("dillinger escape plan") #=> 0.265862076325102 | |
81 | + def entropy(s) | |
82 | + en = 0.0 | |
83 | + counts = self.count_words | |
84 | + | |
85 | + s.split.each do |w| | |
86 | + p_wi = counts[w].to_f/@words.count.to_f | |
87 | + en += p_wi*Math.log2(p_wi) | |
88 | + end | |
89 | + | |
90 | + en *= -1 | |
91 | + en | |
92 | + end | |
93 | + | |
94 | + # Computes the term frequency of a given *word* +s+. | |
95 | + # | |
96 | + # tf("guitar") #=> 0.000380372765310004 | |
97 | + def tf(s) | |
98 | + self.count_words[s].to_f/@words.size.to_f | |
99 | + end | |
100 | + | |
101 | + | |
102 | + def initialize(content) | |
103 | + @doc_content = content | |
104 | + @words = format_words | |
105 | + end | |
106 | + | |
107 | + protected :format_words | |
108 | + end | |
109 | + | |
110 | + # A WebDocument is a Document with a +url+. | |
111 | + class WebDocument < Document | |
112 | + attr_reader :url | |
113 | + | |
114 | + # Returns the HTML text from the page of a given +url+. | |
115 | + def self.get_content(url) | |
116 | + require 'net/http' | |
117 | + Net::HTTP.get(URI.parse(url)) | |
118 | + end | |
119 | + | |
120 | + # WebDocument constructor, the content of the Document is the HTML page | |
121 | + # without the tags. | |
122 | + def initialize(url) | |
123 | + @url = url | |
124 | + super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags | |
125 | + end | |
126 | + end | |
127 | + | |
128 | + # A WikipediaPage is a WebDocument. | |
129 | + class WikipediaPage < WebDocument | |
130 | + require 'rexml/document' | |
131 | + require 'net/http' | |
132 | + require 'kconv' | |
133 | + | |
134 | + | |
135 | + def self.search_wikipedia_titles(name) | |
136 | + raise ArgumentError, "Bad encoding", name unless name.isutf8 | |
137 | + | |
138 | + res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).toutf8).elements['api/query/search'] | |
139 | + | |
140 | + res.collect { |e| e.attributes['title'] } unless res.nil? | |
141 | + end | |
142 | + | |
143 | + def self.get_url(name) | |
144 | + raise ArgumentError, "Bad encoding", name unless name.isutf8 | |
145 | + | |
146 | + atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).toutf8).elements['api/query/pages/page'].attributes | |
147 | + | |
148 | + atts['fullurl'] if atts['missing'].nil? | |
149 | + end | |
150 | + | |
151 | + def self.search_homepage(name) | |
152 | + title = WikipediaPage.search_wikipedia_titles name | |
153 | + | |
154 | + WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty? | |
155 | + end | |
156 | + | |
157 | +# def initialize(name) | |
158 | +# title = WikipediaPage.search_wikipedia_titles name | |
159 | +# raise ArgumentError, "No page found" if title.empty? | |
160 | +# super WikipediaPage.get_url title[0] | |
161 | +# end | |
162 | + end | |
163 | +end |
lib/mirimiri/query.rb
1 | +#!/usr/bin/env ruby | |
2 | + | |
3 | +#-- | |
4 | +# This file is a part of the mirimiri library | |
5 | +# | |
6 | +# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | |
7 | +# | |
8 | +# This program is free software: you can redistribute it and/or modify | |
9 | +# it under the terms of the GNU General Public License as published by | |
10 | +# the Free Software Foundation, either version 3 of the License, or | |
11 | +# (at your option) any later version. | |
12 | +# | |
13 | +# This program is distributed in the hope that it will be useful, | |
14 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | +# GNU General Public License for more details. | |
17 | +# | |
18 | +# You should have received a copy of the GNU General Public License | |
19 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | +#++ | |
21 | + | |
# Base class of all the queries.
class Query
end

# Classes used to build parameter files for Indri queries.
module Indri

  # Retrieval parameters shared by Indri queries. +to_s+ renders them as the
  # beginning of an Indri XML parameter file.
  class Parameters
    attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline

    # * +corpus+      is stored as the index path.
    # * +mem+         is rendered in the <memory> tag.
    # * +count+       is rendered in the <count> tag.
    # * +offset+      is rendered in the <queryOffset> tag.
    # * +run_id+      is rendered in the <runID> tag.
    # * +print_query+ and +print_docs+ are truthy/falsy flags, stored as the
    #   strings "true" or "false".
    def initialize(corpus, mem = "1g", count = "1000", offset = "1", run_id = "default", print_query = false, print_docs = false)
      @index_path  = corpus
      @memory      = mem
      @count       = count
      @offset      = offset
      @run_id      = run_id
      @print_query = print_query ? "true" : "false"
      @print_docs  = print_docs ? "true" : "false"
    end

    # Renders the parameters as XML. The <parameters> tag is deliberately
    # left open: IndriQuery#to_s appends the query and closes it.
    # A <baseline> tag is written when +baseline+ is set, a <rule> tag
    # otherwise.
    def to_s
      retrieval = if @baseline.nil?
                    "<rule>#{@rule}</rule>"
                  else
                    "<baseline>#{@baseline}</baseline>"
                  end

      [
        "<parameters>",
        "<memory>#{@memory}</memory>",
        "<index>#{@index_path}</index>",
        "<count>#{@count}</count>",
        retrieval,
        "<queryOffset>#{@offset}</queryOffset>",
        "<runID>#{@run_id}</runID>",
        "<printQuery>#{@print_query}</printQuery>",
        "<printDocuments>#{@print_docs}</printDocuments>"
      ].join("\n") + "\n"
    end
  end

  # A query with an identifier, rendered together with its Parameters as a
  # complete Indri XML parameter file.
  class IndriQuery < Query
    attr_accessor :id, :query, :params

    # * +id+     is rendered in the <number> tag.
    # * +query+  is rendered in the <text> tag.
    # * +params+ is the Parameters object used for this query. Its +rule+
    #   is set to the default one when it has none (the object is modified
    #   in place).
    def initialize(id, query, params)
      @params = params
      # Here we set the default retrieval model as Language Modeling
      # with a Dirichlet smoothing at 2500.
      # TODO: maybe a Rule class...
      @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil?

      @id    = id
      @query = query
    end

    # The retrieval rule lives in +params+; these accessors delegate to it
    # so that reading or changing the rule through the query is effective.
    def rule
      @params.rule
    end

    def rule=(value)
      @params.rule = value
    end

    # Renders the parameters followed by the query, and closes the
    # <parameters> tag opened by Parameters#to_s.
    def to_s
      "#{@params}<query>\n<number>#{@id}</number>\n<text>#{@query}</text>\n</query>\n</parameters>"
    end
  end

end
lib/mirimiri/regexp.rb
1 | +#!/usr/bin/env ruby | |
2 | + | |
3 | +#-- | |
4 | +# This file is a part of the mirimiri library | |
5 | +# | |
6 | +# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | |
7 | +# | |
8 | +# This program is free software: you can redistribute it and/or modify | |
9 | +# it under the terms of the GNU General Public License as published by | |
10 | +# the Free Software Foundation, either version 3 of the License, or | |
11 | +# (at your option) any later version. | |
12 | +# | |
13 | +# This program is distributed in the hope that it will be useful, | |
14 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | +# GNU General Public License for more details. | |
17 | +# | |
18 | +# You should have received a copy of the GNU General Public License | |
19 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | +#++ | |
21 | + | |
# Extension of the standard class Regexp.
class Regexp

  # Returns a new Regexp matching a line in which +self+ does not occur at
  # any position (built with a negative lookahead around +self+).
  #
  #   /foo/.negated =~ "bar"     #=> 0
  #   /foo/.negated =~ "a foo b" #=> nil
  def negated
    Regexp.new("^((?!#{self}).)*$")
  end

end
lib/mirimiri/string.rb
1 | +#!/usr/bin/env ruby | |
2 | + | |
3 | +#-- | |
4 | +# This file is a part of the mirimiri library | |
5 | +# | |
6 | +# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | |
7 | +# | |
8 | +# This program is free software: you can redistribute it and/or modify | |
9 | +# it under the terms of the GNU General Public License as published by | |
10 | +# the Free Software Foundation, either version 3 of the License, or | |
11 | +# (at your option) any later version. | |
12 | +# | |
13 | +# This program is distributed in the hope that it will be useful, | |
14 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | +# GNU General Public License for more details. | |
17 | +# | |
18 | +# You should have received a copy of the GNU General Public License | |
19 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | +#++ | |
21 | + | |
module Mirimiri

  # These are the default stopwords provided by Lemur.
  # The list is frozen: it is shared by every String through is_stopword?.
  Stoplist = [
    "a", "anything", "anyway", "anywhere", "apart", "are", "around", "as", "at", "av",
    "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand",
    "behind", "being", "below", "beside", "besides", "between", "beyond", "both", "but", "by",
    "can", "cannot", "canst", "certain", "cf", "choose", "contrariwise", "cos", "could", "cu",
    "day", "do", "does", "doesn't", "doing", "dost", "doth", "double", "down", "dual", "during",
    "each", "either", "else", "elsewhere", "enough", "et", "etc", "even", "ever", "every",
    "everybody", "everyone", "everything", "everywhere", "except", "excepted", "excepting",
    "exception", "exclude", "excluding", "exclusive", "far", "farther", "farthest", "few", "ff",
    "first", "for", "formerly", "forth", "forward", "from", "front", "further", "furthermore",
    "furthest", "get", "go", "had", "halves", "hardly", "has", "hast", "hath", "have", "he",
    "hence", "henceforth", "her", "here", "hereabouts", "hereafter", "hereby", "herein", "hereto",
    "hereupon", "hers", "herself", "him", "himself", "hindmost", "his", "hither", "hitherto",
    "how", "however", "howsoever", "i", "ie", "if", "in", "inasmuch", "inc", "include",
    "included", "including", "indeed", "indoors", "inside", "insomuch", "instead", "into",
    "inward", "inwards", "is", "it", "its", "itself", "just", "kind", "kg", "km", "last",
    "latter", "latterly", "less", "lest", "let", "like", "little", "ltd", "many", "may", "maybe",
    "me", "meantime", "meanwhile", "might", "moreover", "most", "mostly", "more", "mr", "mrs",
    "ms", "much", "must", "my", "myself", "namely", "need", "neither", "never", "nevertheless",
    "next", "no", "nobody", "none", "nonetheless", "noone", "nope", "nor", "not", "nothing",
    "notwithstanding", "now", "nowadays", "nowhere", "of", "off", "often", "ok", "on", "once",
    "one", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours",
    "ourselves", "out", "outside", "over", "own", "per", "perhaps", "plenty", "provide", "quite",
    "rather", "really", "round", "said", "sake", "same", "sang", "save", "saw", "see", "seeing",
    "seem", "seemed", "seeming", "seems", "seen", "seldom", "selves", "sent", "several", "shalt",
    "she", "should", "shown", "sideways", "since", "slept", "slew", "slung", "slunk", "smote",
    "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes",
    "somewhat", "somewhere", "spake", "spat", "spoke", "spoken", "sprang", "sprung", "stave",
    "staves", "still", "such", "supposing", "than", "that", "the", "thee", "their", "them",
    "themselves", "then", "thence", "thenceforth", "there", "thereabout", "thereabouts",
    "thereafter", "thereby", "therefore", "therein", "thereof", "thereon", "thereto", "thereupon",
    "these", "they", "this", "those", "thou", "though", "thrice", "through", "throughout", "thru",
    "thus", "thy", "thyself", "till", "to", "together", "too", "toward", "towards", "ugh",
    "unable", "under", "underneath", "unless", "unlike", "until", "up", "upon", "upward",
    "upwards", "us", "use", "used", "using", "very", "via", "vs", "want", "was", "we", "week",
    "well", "were", "what", "whatever", "whatsoever", "when", "whence", "whenever", "whensoever",
    "where", "whereabouts", "whereafter", "whereas", "whereat", "whereby", "wherefore",
    "wherefrom", "wherein", "whereinto", "whereof", "whereon", "wheresoever", "whereto",
    "whereunto", "whereupon", "wherever", "wherewith", "whether", "whew", "which", "whichever",
    "whichsoever", "while", "whilst", "whither", "who", "whoa", "whoever", "whole", "whom",
    "whomever", "whomsoever", "whose", "whosoever", "why", "will", "wilt", "with", "within",
    "without", "worse", "worst", "would", "wow", "ye", "yet", "year", "yippee", "you", "your",
    "yours", "yourself", "yourselves"
  ].freeze


end

# Extension of the standard class String with useful functions.
class String
  include Mirimiri

  # Returns +true+ if +self+ (case-insensitively) belongs to
  # Mirimiri::Stoplist, +false+ otherwise.
  def is_stopword?
    Stoplist.include?(self.downcase)
  end

  # Do not use.
  # TODO: revamp. find why this function is here.
  #
  # As written: every whitespace-separated token has its non-word characters
  # turned into spaces, sub-tokens made of a single character are blanked,
  # and everything is joined back with single spaces (blanked tokens still
  # leave their separator behind).
  def remove_special_characters
    cleaned = self.split.collect do |token|
      pieces = token.gsub(/\W/, ' ').split.collect do |piece|
        piece.gsub(/\W/, ' ').strip.sub(/\A.\z/, '')
      end
      pieces.join(' ').strip.sub(/\A.\z/, '')
    end

    cleaned.join(' ')
  end

  # Removes all XML-like tags from +self+, in place. Returns +self+.
  #
  #   s = "<html><body>test</body></html>"
  #   s.strip_xml_tags!
  #   s         #=> "test"
  def strip_xml_tags!
    replace(strip_with_pattern(/<\/?[^>]*>/))
  end

  # Returns a copy of +self+ without its XML-like tags.
  #
  #   s = "<html><body>test</body></html>"
  #   s.strip_xml_tags   #=> "test"
  #   s         #=> "<html><body>test</body></html>"
  def strip_xml_tags
    dup.strip_xml_tags!
  end

  # Removes all <script> elements (tags and content) from +self+, in place,
  # whatever their attributes, quoting style or letter case are.
  # Returns +self+.
  #
  #   s = "<script type='text/javascript'>
  #        var skin='vector',
  #        stylepath='http://bits.wikimedia.org/skins-1.5'
  #        </script>
  #
  #        test"
  #   s.strip_javascripts!
  #   s         #=> "test"
  def strip_javascripts!
    replace(strip_with_pattern(/<script\b[^>]*>.*?<\/script\s*>/mi))
  end

  # Returns a copy of +self+ without its <script> elements.
  #
  #   s = "<script type='text/javascript'>
  #        var skin='vector',
  #        stylepath='http://bits.wikimedia.org/skins-1.5'
  #        </script>
  #
  #        test"
  #   s.strip_javascripts   #=> "test"
  def strip_javascripts
    dup.strip_javascripts!
  end

  # Removes all <style> elements (tags and content) from +self+, in place,
  # whatever their attributes, quoting style or letter case are.
  # Returns +self+.
  def strip_stylesheets!
    replace(strip_with_pattern(/<style\b[^>]*>.*?<\/style\s*>/mi))
  end

  # Returns a copy of +self+ without its <style> elements.
  def strip_stylesheets
    dup.strip_stylesheets!
  end

  # Removes punctuation from +self+, in place: every character that is not
  # an ASCII letter, a digit, a hyphen or a whitespace. Returns +self+.
  #
  #   s = "hello, world. how are you?!"
  #   s.strip_punctuation!
  #   s         # => "hello world how are you"
  def strip_punctuation!
    replace(strip_with_pattern(/[^a-zA-Z0-9\-\s]/))
  end

  # Returns a copy of +self+ without punctuation.
  #
  #   s = "hello, world. how are you?!"
  #   s.strip_punctuation   # => "hello world how are you"
  def strip_punctuation
    dup.strip_punctuation!
  end

  # Returns the text values inside all occurrences of a XML tag in +self+.
  # Only the tag named exactly +tag_name+ is considered ('a' does not match
  # <abbr>), and a value must stand on a single line.
  #
  #   s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre"
  #   s.extract_xmltags_values 'a'   #=> ["Indianapolis", "Indiana"]
  def extract_xmltags_values(tag_name)
    tag = Regexp.escape(tag_name.to_s)
    self.scan(/<#{tag}\b[^>]*>(.+?)<\/#{tag}>/).flatten
  end

  # Returns a copy of +self+ where every match of +pattern+ is removed, the
  # HTML entities are unescaped and the result is converted to UTF-8.
  #
  # NOTE(review): entities are unescaped at every stripping step, so chaining
  # several strip_* calls can turn "&lt;b&gt;" into a tag that a later step
  # removes - confirm whether this is intended.
  def strip_with_pattern(pattern)
    # Required here, as in the original code, so these libraries are only
    # loaded when a strip_* method is actually used.
    require 'cgi'
    require 'kconv'
    CGI::unescapeHTML(self.gsub(pattern, "")).toutf8
  end

  private :strip_with_pattern
end
lib/mirimiri/ttagger.rb
1 | +#!/usr/bin/env ruby | |
2 | + | |
3 | +#-- | |
4 | +# This file is a part of the mirimiri library | |
5 | +# | |
6 | +# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | |
7 | +# | |
8 | +# This program is free software: you can redistribute it and/or modify | |
9 | +# it under the terms of the GNU General Public License as published by | |
10 | +# the Free Software Foundation, either version 3 of the License, or | |
11 | +# (at your option) any later version. | |
12 | +# | |
13 | +# This program is distributed in the hope that it will be useful, | |
14 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | +# GNU General Public License for more details. | |
17 | +# | |
18 | +# You should have received a copy of the GNU General Public License | |
19 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | +#++ | |
21 | + | |
22 | + | |
# TreeTagger-related stuff module.
#
# See http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/DecisionTreeTagger.html
module TreeTagger

  # This class handles generic parsing of tagger-chunker outputs.
  class TaggerChunker
    attr_reader :chunks, :file

    # Parses a tagger-chunker output and returns an Array of Chunk.
    #
    # +chunk_lines+ is an enumerable of lines; the lines themselves are not
    # modified. A line made of an opening tag ("<NC>") starts a chunk, the
    # matching closing tag ("</NC>") ends it, and for any other line the
    # first whitespace-separated token is kept as a word. Blank lines are
    # ignored. A closing tag that does not match the last opening tag, or
    # that comes with no pending word, is skipped.
    #
    # NOTE(review): words read outside of any chunk, or before a nested
    # opening tag, are kept and end up in the next chunk that closes -
    # confirm whether this is intended.
    def self.parse(chunk_lines)
      inside_chunk = false
      tag = nil

      chunks = []
      words = []

      chunk_lines.each do |raw|
        l = raw.chomp

        if l =~ /^<\w+>$/
          inside_chunk = true
          tag = l
        elsif l =~ /^<\/\w+>$/
          next unless !words.empty? && inside_chunk && l == tag.sub(/</, '</')

          inside_chunk = false
          chunks.push Chunk.new(words.join(" "), tag)
          words.clear
        else
          word = l.split.first
          words.push(word) unless word.nil?
        end
      end

      chunks
    end

    # Initializes parsing. +chunk_file+ is the output of +tagger-chunker+ and must
    # be a valid path to the file. The path is kept in +file+ and the parsed
    # chunks in +chunks+.
    #
    #   TaggerChunker.new("ttout/2010020") #=> #<TreeTagger::TaggerChunker:0x92fd088 @chunks=[#<TreeTagger::Chunk:0x8ec5a10 @words=["robert", "schumann"], @tag="NC">, ...] ...>
    def initialize(chunk_file)
      @file = chunk_file
      # File.readlines closes the file once it is read.
      @chunks = TaggerChunker.parse File.readlines(chunk_file)
    end

  end

  class TaggerChunkerEnglish < TaggerChunker
  end

  class TaggerChunkerFrench < TaggerChunker
  end

  class TaggerChunkerGerman < TaggerChunker
  end

  # Represents a Chunk extracted when parsing a TaggerChunker file.
  class Chunk
    attr_reader :words, :tag

    # Creates a Chunk.
    #
    # * +str+ are whitespace-separated terms.
    # * +tag+ is the opening tag as read in the file ("<NC>"); it is stored
    #   without its first and last characters ("NC").
    #   See : ftp://ftp.ims.uni-stuttgart.de/pub/corpora/chunker-tagset-english.txt
    def initialize(str, tag)
      @words = str.split
      @tag   = tag[1..-2]
    end
  end

end
95 | +end |