Commit cd74322524114fbad147da48f608384d03e46c58
1 parent
b3995017e6
Exists in
master
adding missing files
Showing 29 changed files with 2390 additions and 0 deletions Side-by-side Diff
- doc/classes/Mirimiri.html
- doc/classes/Mirimiri/Document.html
- doc/classes/Mirimiri/Document.src/M000022.html
- doc/classes/Mirimiri/Document.src/M000023.html
- doc/classes/Mirimiri/Document.src/M000024.html
- doc/classes/Mirimiri/Document.src/M000025.html
- doc/classes/Mirimiri/Document.src/M000026.html
- doc/classes/Mirimiri/Document.src/M000027.html
- doc/classes/Mirimiri/WebDocument.html
- doc/classes/Mirimiri/WebDocument.src/M000028.html
- doc/classes/Mirimiri/WebDocument.src/M000029.html
- doc/classes/Mirimiri/WikipediaPage.html
- doc/classes/Mirimiri/WikipediaPage.src/M000030.html
- doc/classes/Mirimiri/WikipediaPage.src/M000031.html
- doc/classes/Mirimiri/WikipediaPage.src/M000032.html
- doc/files/lib/mirimiri/corpus_rb.html
- doc/files/lib/mirimiri/document_rb.html
- doc/files/lib/mirimiri/query_rb.html
- doc/files/lib/mirimiri/regexp_rb.html
- doc/files/lib/mirimiri/string_rb.html
- doc/files/lib/mirimiri/ttagger_rb.html
- doc/files/lib/mirimiri_rb.html
- lib/mirimiri.rb
- lib/mirimiri/corpus.rb
- lib/mirimiri/document.rb
- lib/mirimiri/query.rb
- lib/mirimiri/regexp.rb
- lib/mirimiri/string.rb
- lib/mirimiri/ttagger.rb
doc/classes/Mirimiri.html
| 1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
| 2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
| 3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
| 4 | +<head> | |
| 5 | + <title>Module: Mirimiri [RDoc Documentation]</title> | |
| 6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
| 7 | + <meta http-equiv="Content-Script-Type" content="text/javascript" /> | |
| 8 | + <link rel="stylesheet" href=".././rdoc-style.css" type="text/css" media="screen" /> | |
| 9 | + <script type="text/javascript"> | |
| 10 | + // <![CDATA[ | |
| 11 | + | |
| 12 | + function popupCode( url ) { | |
| 13 | + window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | |
| 14 | + } | |
| 15 | + | |
| 16 | + function toggleCode( id ) { | |
| 17 | + if ( document.getElementById ) | |
| 18 | + elem = document.getElementById( id ); | |
| 19 | + else if ( document.all ) | |
| 20 | + elem = eval( "document.all." + id ); | |
| 21 | + else | |
| 22 | + return false; | |
| 23 | + | |
| 24 | + elemStyle = elem.style; | |
| 25 | + | |
| 26 | + if ( elemStyle.display != "block" ) { | |
| 27 | + elemStyle.display = "block" | |
| 28 | + } else { | |
| 29 | + elemStyle.display = "none" | |
| 30 | + } | |
| 31 | + | |
| 32 | + return true; | |
| 33 | + } | |
| 34 | + | |
| 35 | + // Make codeblocks hidden by default | |
| 36 | + document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | |
| 37 | + | |
| 38 | + // ]]> | |
| 39 | + </script> | |
| 40 | + | |
| 41 | +</head> | |
| 42 | +<body> | |
| 43 | + | |
| 44 | + | |
| 45 | + <div id="classHeader"> | |
| 46 | + <table class="header-table"> | |
| 47 | + <tr class="top-aligned-row"> | |
| 48 | + <td><strong>Module</strong></td> | |
| 49 | + <td class="class-name-in-header">Mirimiri</td> | |
| 50 | + </tr> | |
| 51 | + <tr class="top-aligned-row"> | |
| 52 | + <td><strong>In:</strong></td> | |
| 53 | + <td> | |
| 54 | + | |
| 55 | + | |
| 56 | + <a href="../files/lib/mirimiri/string_rb.html"> | |
| 57 | + | |
| 58 | + lib/mirimiri/string.rb | |
| 59 | + | |
| 60 | + </a> | |
| 61 | + | |
| 62 | + | |
| 63 | + <br /> | |
| 64 | + | |
| 65 | + | |
| 66 | + <a href="../files/lib/mirimiri/document_rb.html"> | |
| 67 | + | |
| 68 | + lib/mirimiri/document.rb | |
| 69 | + | |
| 70 | + </a> | |
| 71 | + | |
| 72 | + | |
| 73 | + <br /> | |
| 74 | + | |
| 75 | + </td> | |
| 76 | + </tr> | |
| 77 | + | |
| 78 | + | |
| 79 | + </table> | |
| 80 | + </div> | |
| 81 | + <!-- banner header --> | |
| 82 | + | |
| 83 | + <div id="bodyContent"> | |
| 84 | + | |
| 85 | + <div id="contextContent"> | |
| 86 | + | |
| 87 | + <div id="description"> | |
| 88 | + <hr size="1"></hr><p> | |
| 89 | +General module | |
| 90 | +</p> | |
| 91 | + | |
| 92 | + </div> | |
| 93 | + | |
| 94 | + </div> | |
| 95 | + | |
| 96 | + | |
| 97 | + </div> | |
| 98 | + | |
| 99 | + <!-- if includes --> | |
| 100 | + | |
| 101 | + <div id="section"> | |
| 102 | + | |
| 103 | + <div id="class-list"> | |
| 104 | + <h3 class="section-bar">Classes and Modules</h3> | |
| 105 | + | |
| 106 | + Class <a href="Mirimiri/Document.html" class="link">Mirimiri::Document</a><br /> | |
| 107 | +Class <a href="Mirimiri/WebDocument.html" class="link">Mirimiri::WebDocument</a><br /> | |
| 108 | +Class <a href="Mirimiri/WikipediaPage.html" class="link">Mirimiri::WikipediaPage</a><br /> | |
| 109 | + | |
| 110 | + </div> | |
| 111 | + | |
| 112 | + <div id="constants-list"> | |
| 113 | + <h3 class="section-bar">Constants</h3> | |
| 114 | + | |
| 115 | + <div class="name-list"> | |
| 116 | + <table summary="Constants"> | |
| 117 | + | |
| 118 | + <tr class="top-aligned-row context-row"> | |
| 119 | + <td class="context-item-name">Stoplist</td> | |
| 120 | + <td>=</td> | |
| 121 | + <td class="context-item-value">[ "a", "anything", "anyway", "anywhere", "apart", "are", "around", "as", "at", "av", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "both", "but", "by", "can", "cannot", "canst", "certain", "cf", "choose", "contrariwise", "cos", "could", "cu", "day", "do", "does", "doesn't", "doing", "dost", "doth", "double", "down", "dual", "during", "each", "either", "else", "elsewhere", "enough", "et", "etc", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "except", "excepted", "excepting", "exception", "exclude", "excluding", "exclusive", "far", "farther", "farthest", "few", "ff", "first", "for", "formerly", "forth", "forward", "from", "front", "further", "furthermore", "furthest", "get", "go", "had", "halves", "hardly", "has", "hast", "hath", "have", "he", "hence", "henceforth", "her", "here", "hereabouts", "hereafter", "hereby", "herein", "hereto", "hereupon", "hers", "herself", "him", "himself", "hindmost", "his", "hither", "hitherto", "how", "however", "howsoever", "i", "ie", "if", "in", "inasmuch", "inc", "include", "included", "including", "indeed", "indoors", "inside", "insomuch", "instead", "into", "inward", "inwards", "is", "it", "its", "itself", "just", "kind", "kg", "km", "last", "latter", "latterly", "less", "lest", "let", "like", "little", "ltd", "many", "may", "maybe", "me", "meantime", "meanwhile", "might", "moreover", "most", "mostly", "more", "mr", "mrs", "ms", "much", "must", "my", "myself", "namely", "need", "neither", "never", "nevertheless", "next", "no", "nobody", "none", "nonetheless", "noone", "nope", "nor", "not", "nothing", "notwithstanding", "now", "nowadays", "nowhere", "of", "off", "often", "ok", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", "ourselves", "out", "outside", "over", "own", "per", "perhaps", "plenty", 
"provide", "quite", "rather", "really", "round", "said", "sake", "same", "sang", "save", "saw", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "seldom", "selves", "sent", "several", "shalt", "she", "should", "shown", "sideways", "since", "slept", "slew", "slung", "slunk", "smote", "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", "somewhat", "somewhere", "spake", "spat", "spoke", "spoken", "sprang", "sprung", "stave", "staves", "still", "such", "supposing", "than", "that", "the", "thee", "their", "them", "themselves", "then", "thence", "thenceforth", "there", "thereabout", "thereabouts", "thereafter", "thereby", "therefore", "therein", "thereof", "thereon", "thereto", "thereupon", "these", "they", "this", "those", "thou", "though", "thrice", "through", "throughout", "thru", "thus", "thy", "thyself", "till", "to", "together", "too", "toward", "towards", "ugh", "unable", "under", "underneath", "unless", "unlike", "until", "up", "upon", "upward", "upwards", "us", "use", "used", "using", "very", "via", "vs", "want", "was", "we", "week", "well", "were", "what", "whatever", "whatsoever", "when", "whence", "whenever", "whensoever", "where", "whereabouts", "whereafter", "whereas", "whereat", "whereby", "wherefore", "wherefrom", "wherein", "whereinto", "whereof", "whereon", "wheresoever", "whereto", "whereunto", "whereupon", "wherever", "wherewith", "whether", "whew", "which", "whichever", "whichsoever", "while", "whilst", "whither", "who", "whoa", "whoever", "whole", "whom", "whomever", "whomsoever", "whose", "whosoever", "why", "will", "wilt", "with", "within", "without", "worse", "worst", "would", "wow", "ye", "yet", "year", "yippee", "you", "your", "yours", "yourself", "yourselves" ]</td> | |
| 122 | + | |
| 123 | + <td> </td> | |
| 124 | + <td class="context-item-desc"> | |
| 125 | +These are the default stopwords provided by Lemur. | |
| 126 | + | |
| 127 | +</td> | |
| 128 | + | |
| 129 | + </tr> | |
| 130 | + | |
| 131 | + </table> | |
| 132 | + </div> | |
| 133 | + </div> | |
| 134 | + | |
| 135 | + | |
| 136 | + | |
| 137 | + | |
| 138 | + <!-- if method_list --> | |
| 139 | + | |
| 140 | + | |
| 141 | + | |
| 142 | + | |
| 143 | + </div> | |
| 144 | + | |
| 145 | +<div id="validator-badges"> | |
| 146 | + <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | |
| 147 | +</div> | |
| 148 | + | |
| 149 | +</body> | |
| 150 | +</html> |
doc/classes/Mirimiri/Document.html
| 1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
| 2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
| 3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
| 4 | +<head> | |
| 5 | + <title>Class: Mirimiri::Document [RDoc Documentation]</title> | |
| 6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
| 7 | + <meta http-equiv="Content-Script-Type" content="text/javascript" /> | |
| 8 | + <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" /> | |
| 9 | + <script type="text/javascript"> | |
| 10 | + // <![CDATA[ | |
| 11 | + | |
| 12 | + function popupCode( url ) { | |
| 13 | + window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | |
| 14 | + } | |
| 15 | + | |
| 16 | + function toggleCode( id ) { | |
| 17 | + if ( document.getElementById ) | |
| 18 | + elem = document.getElementById( id ); | |
| 19 | + else if ( document.all ) | |
| 20 | + elem = eval( "document.all." + id ); | |
| 21 | + else | |
| 22 | + return false; | |
| 23 | + | |
| 24 | + elemStyle = elem.style; | |
| 25 | + | |
| 26 | + if ( elemStyle.display != "block" ) { | |
| 27 | + elemStyle.display = "block" | |
| 28 | + } else { | |
| 29 | + elemStyle.display = "none" | |
| 30 | + } | |
| 31 | + | |
| 32 | + return true; | |
| 33 | + } | |
| 34 | + | |
| 35 | + // Make codeblocks hidden by default | |
| 36 | + document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | |
| 37 | + | |
| 38 | + // ]]> | |
| 39 | + </script> | |
| 40 | + | |
| 41 | +</head> | |
| 42 | +<body> | |
| 43 | + | |
| 44 | + | |
| 45 | + <div id="classHeader"> | |
| 46 | + <table class="header-table"> | |
| 47 | + <tr class="top-aligned-row"> | |
| 48 | + <td><strong>Class</strong></td> | |
| 49 | + <td class="class-name-in-header">Mirimiri::Document</td> | |
| 50 | + </tr> | |
| 51 | + <tr class="top-aligned-row"> | |
| 52 | + <td><strong>In:</strong></td> | |
| 53 | + <td> | |
| 54 | + | |
| 55 | + | |
| 56 | + <a href="../../files/lib/mirimiri/document_rb.html"> | |
| 57 | + | |
| 58 | + lib/mirimiri/document.rb | |
| 59 | + | |
| 60 | + </a> | |
| 61 | + | |
| 62 | + | |
| 63 | + <br /> | |
| 64 | + | |
| 65 | + </td> | |
| 66 | + </tr> | |
| 67 | + | |
| 68 | + | |
| 69 | + <tr class="top-aligned-row"> | |
| 70 | + <td><strong>Parent:</strong></td> | |
| 71 | + <td> | |
| 72 | + | |
| 73 | + Object | |
| 74 | + | |
| 75 | + </td> | |
| 76 | + </tr> | |
| 77 | + | |
| 78 | + </table> | |
| 79 | + </div> | |
| 80 | + <!-- banner header --> | |
| 81 | + | |
| 82 | + <div id="bodyContent"> | |
| 83 | + | |
| 84 | + <div id="contextContent"> | |
| 85 | + | |
| 86 | + <div id="description"> | |
| 87 | + <p> | |
| 88 | +A <a href="Document.html">Document</a> is a bag of words and is constructed | |
| 89 | +from a string. | |
| 90 | +</p> | |
| 91 | + | |
| 92 | + </div> | |
| 93 | + | |
| 94 | + </div> | |
| 95 | + | |
| 96 | + | |
| 97 | + <div id="method-list"> | |
| 98 | + <h3 class="section-bar">Methods</h3> | |
| 99 | + | |
| 100 | + <div class="name-list"> | |
| 101 | + | |
| 102 | + <a href="#M000024">count_words</a> | |
| 103 | + | |
| 104 | + <a href="#M000025">entropy</a> | |
| 105 | + | |
| 106 | + <a href="#M000022">format_words</a> | |
| 107 | + | |
| 108 | + <a href="#M000027">new</a> | |
| 109 | + | |
| 110 | + <a href="#M000023">ngrams</a> | |
| 111 | + | |
| 112 | + <a href="#M000026">tf</a> | |
| 113 | + | |
| 114 | + </div> | |
| 115 | + </div> | |
| 116 | + | |
| 117 | + </div> | |
| 118 | + | |
| 119 | + <!-- if includes --> | |
| 120 | + | |
| 121 | + <div id="section"> | |
| 122 | + | |
| 123 | + | |
| 124 | + | |
| 125 | + <div id="attribute-list"> | |
| 126 | + <h3 class="section-bar">Attributes</h3> | |
| 127 | + | |
| 128 | + <div class="name-list"> | |
| 129 | + <table> | |
| 130 | + | |
| 131 | + <tr class="top-aligned-row context-row"> | |
| 132 | + <td class="context-item-name">doc_content</td> | |
| 133 | + | |
| 134 | + <td class="context-item-value"> [R] </td> | |
| 135 | + | |
| 136 | + <td class="context-item-desc"></td> | |
| 137 | + </tr> | |
| 138 | + | |
| 139 | + <tr class="top-aligned-row context-row"> | |
| 140 | + <td class="context-item-name">words</td> | |
| 141 | + | |
| 142 | + <td class="context-item-value"> [R] </td> | |
| 143 | + | |
| 144 | + <td class="context-item-desc"></td> | |
| 145 | + </tr> | |
| 146 | + | |
| 147 | + </table> | |
| 148 | + </div> | |
| 149 | + </div> | |
| 150 | + | |
| 151 | + | |
| 152 | + <!-- if method_list --> | |
| 153 | + | |
| 154 | + <div id="methods"> | |
| 155 | + | |
| 156 | + <h3 class="section-bar">Public Class methods</h3> | |
| 157 | + | |
| 158 | + | |
| 159 | + <div id="method-M000027" class="method-detail"> | |
| 160 | + <a name="M000027"></a> | |
| 161 | + | |
| 162 | + <div class="method-heading"> | |
| 163 | + | |
| 164 | + <a href="Document.src/M000027.html" target="Code" class="method-signature" | |
| 165 | + onclick="popupCode('Document.src/M000027.html');return false;"> | |
| 166 | + | |
| 167 | + <span class="method-name">new</span><span class="method-args">(content)</span> | |
| 168 | + | |
| 169 | + </a> | |
| 170 | + | |
| 171 | + </div> | |
| 172 | + | |
| 173 | + <div class="method-description"> | |
| 174 | + | |
| 175 | + </div> | |
| 176 | + </div> | |
| 177 | + | |
| 178 | + | |
| 179 | + <h3 class="section-bar">Public Instance methods</h3> | |
| 180 | + | |
| 181 | + | |
| 182 | + <div id="method-M000024" class="method-detail"> | |
| 183 | + <a name="M000024"></a> | |
| 184 | + | |
| 185 | + <div class="method-heading"> | |
| 186 | + | |
| 187 | + <a href="Document.src/M000024.html" target="Code" class="method-signature" | |
| 188 | + onclick="popupCode('Document.src/M000024.html');return false;"> | |
| 189 | + | |
| 190 | + <span class="method-name">count_words</span><span class="method-args">()</span> | |
| 191 | + | |
| 192 | + </a> | |
| 193 | + | |
| 194 | + </div> | |
| 195 | + | |
| 196 | + <div class="method-description"> | |
| 197 | + | |
| 198 | + <p> | |
| 199 | +Returns a Hash containing the words and their associated counts in the | |
| 200 | +current <a href="Document.html">Document</a>. | |
| 201 | +</p> | |
| 202 | +<pre> | |
| 203 | + count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... } | |
| 204 | +</pre> | |
| 205 | + | |
| 206 | + </div> | |
| 207 | + </div> | |
| 208 | + | |
| 209 | + | |
| 210 | + <div id="method-M000025" class="method-detail"> | |
| 211 | + <a name="M000025"></a> | |
| 212 | + | |
| 213 | + <div class="method-heading"> | |
| 214 | + | |
| 215 | + <a href="Document.src/M000025.html" target="Code" class="method-signature" | |
| 216 | + onclick="popupCode('Document.src/M000025.html');return false;"> | |
| 217 | + | |
| 218 | + <span class="method-name">entropy</span><span class="method-args">(s)</span> | |
| 219 | + | |
| 220 | + </a> | |
| 221 | + | |
| 222 | + </div> | |
| 223 | + | |
| 224 | + <div class="method-description"> | |
| 225 | + | |
| 226 | + <p> | |
| 227 | +Computes the entropy of a given string <tt>s</tt> inside the document. | |
| 228 | +</p> | |
| 229 | +<p> | |
| 230 | +If the string parameter is composed of many words (i.e. tokens separated by | |
| 231 | +whitespace(s)), it is considered as an ngram. | |
| 232 | +</p> | |
| 233 | +<pre> | |
| 234 | + entropy("guitar") #=> 0.00432114812727959 | |
| 235 | + entropy("dillinger escape plan") #=> 0.265862076325102 | |
| 236 | +</pre> | |
| 237 | + | |
| 238 | + </div> | |
| 239 | + </div> | |
| 240 | + | |
| 241 | + | |
| 242 | + <div id="method-M000023" class="method-detail"> | |
| 243 | + <a name="M000023"></a> | |
| 244 | + | |
| 245 | + <div class="method-heading"> | |
| 246 | + | |
| 247 | + <a href="Document.src/M000023.html" target="Code" class="method-signature" | |
| 248 | + onclick="popupCode('Document.src/M000023.html');return false;"> | |
| 249 | + | |
| 250 | + <span class="method-name">ngrams</span><span class="method-args">(n)</span> | |
| 251 | + | |
| 252 | + </a> | |
| 253 | + | |
| 254 | + </div> | |
| 255 | + | |
| 256 | + <div class="method-description"> | |
| 257 | + | |
| 258 | + <p> | |
| 259 | +Returns an Array containing the <tt>n</tt>-grams (words) from the current | |
| 260 | +<a href="Document.html">Document</a>. | |
| 261 | +</p> | |
| 262 | +<pre> | |
| 263 | + ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...] | |
| 264 | +</pre> | |
| 265 | + | |
| 266 | + </div> | |
| 267 | + </div> | |
| 268 | + | |
| 269 | + | |
| 270 | + <div id="method-M000026" class="method-detail"> | |
| 271 | + <a name="M000026"></a> | |
| 272 | + | |
| 273 | + <div class="method-heading"> | |
| 274 | + | |
| 275 | + <a href="Document.src/M000026.html" target="Code" class="method-signature" | |
| 276 | + onclick="popupCode('Document.src/M000026.html');return false;"> | |
| 277 | + | |
| 278 | + <span class="method-name">tf</span><span class="method-args">(s)</span> | |
| 279 | + | |
| 280 | + </a> | |
| 281 | + | |
| 282 | + </div> | |
| 283 | + | |
| 284 | + <div class="method-description"> | |
| 285 | + | |
| 286 | + <p> | |
| 287 | +Computes the term frequency of a given <b>word</b> <tt>s</tt>. | |
| 288 | +</p> | |
| 289 | +<pre> | |
| 290 | + tf("guitar") #=> 0.000380372765310004 | |
| 291 | +</pre> | |
| 292 | + | |
| 293 | + </div> | |
| 294 | + </div> | |
| 295 | + | |
| 296 | + | |
| 297 | + <h3 class="section-bar">Protected Instance methods</h3> | |
| 298 | + | |
| 299 | + | |
| 300 | + <div id="method-M000022" class="method-detail"> | |
| 301 | + <a name="M000022"></a> | |
| 302 | + | |
| 303 | + <div class="method-heading"> | |
| 304 | + | |
| 305 | + <a href="Document.src/M000022.html" target="Code" class="method-signature" | |
| 306 | + onclick="popupCode('Document.src/M000022.html');return false;"> | |
| 307 | + | |
| 308 | + <span class="method-name">format_words</span><span class="method-args">()</span> | |
| 309 | + | |
| 310 | + </a> | |
| 311 | + | |
| 312 | + </div> | |
| 313 | + | |
| 314 | + <div class="method-description"> | |
| 315 | + | |
| 316 | + <p> | |
| 317 | +Any non-word characters are removed from the words (see <a | |
| 318 | +href="http://perldoc.perl.org/perlre.html">perldoc.perl.org/perlre.html</a> | |
| 319 | +and the \W special escape). | |
| 320 | +</p> | |
| 321 | +<p> | |
| 322 | +Protected function, only meant to be called at initialization. | |
| 323 | +</p> | |
| 324 | + | |
| 325 | + </div> | |
| 326 | + </div> | |
| 327 | + | |
| 328 | + | |
| 329 | + | |
| 330 | + </div> | |
| 331 | + | |
| 332 | + | |
| 333 | + | |
| 334 | + | |
| 335 | + </div> | |
| 336 | + | |
| 337 | +<div id="validator-badges"> | |
| 338 | + <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | |
| 339 | +</div> | |
| 340 | + | |
| 341 | +</body> | |
| 342 | +</html> |
doc/classes/Mirimiri/Document.src/M000022.html
| 1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
| 2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
| 3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
| 4 | +<head> | |
| 5 | + <title>format_words (Mirimiri::Document)</title> | |
| 6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
| 7 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
| 8 | +</head> | |
| 9 | +<body class="standalone-code"> | |
| 10 | + <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 34</span> | |
| 11 | + <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">format_words</span> | |
| 12 | + <span class="ruby-identifier">wo</span> = [] | |
| 13 | + | |
| 14 | + <span class="ruby-ivar">@doc_content</span>.<span class="ruby-identifier">split</span>.<span class="ruby-identifier">each</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">w</span><span class="ruby-operator">|</span> | |
| 15 | + <span class="ruby-identifier">w</span>.<span class="ruby-identifier">split</span>(<span class="ruby-regexp re">/\W/</span>).<span class="ruby-identifier">each</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">sw</span><span class="ruby-operator">|</span> | |
| 16 | + <span class="ruby-identifier">wo</span>.<span class="ruby-identifier">push</span>(<span class="ruby-identifier">sw</span>.<span class="ruby-identifier">downcase</span>) <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">sw</span> <span class="ruby-operator">=~</span> <span class="ruby-regexp re">/[a-zA-Z]/</span> | |
| 17 | + <span class="ruby-keyword kw">end</span> | |
| 18 | + <span class="ruby-keyword kw">end</span> | |
| 19 | + | |
| 20 | + <span class="ruby-identifier">wo</span> | |
| 21 | + <span class="ruby-keyword kw">end</span></pre> | |
| 22 | +</body> | |
| 23 | +</html> |
doc/classes/Mirimiri/Document.src/M000023.html
| 1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
| 2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
| 3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
| 4 | +<head> | |
| 5 | + <title>ngrams (Mirimiri::Document)</title> | |
| 6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
| 7 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
| 8 | +</head> | |
| 9 | +<body class="standalone-code"> | |
| 10 | + <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 49</span> | |
| 11 | + <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">ngrams</span>(<span class="ruby-identifier">n</span>) | |
| 12 | + <span class="ruby-identifier">window</span> = [] | |
| 13 | + <span class="ruby-identifier">ngrams_array</span> = [] | |
| 14 | + | |
| 15 | + <span class="ruby-ivar">@words</span>.<span class="ruby-identifier">each</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">w</span><span class="ruby-operator">|</span> | |
| 16 | + <span class="ruby-identifier">window</span>.<span class="ruby-identifier">push</span>(<span class="ruby-identifier">w</span>) | |
| 17 | + <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">window</span>.<span class="ruby-identifier">size</span> <span class="ruby-operator">==</span> <span class="ruby-identifier">n</span> | |
| 18 | + <span class="ruby-identifier">ngrams_array</span>.<span class="ruby-identifier">push</span> <span class="ruby-identifier">window</span>.<span class="ruby-identifier">join</span>(<span class="ruby-value str">" "</span>) | |
| 19 | + <span class="ruby-identifier">window</span>.<span class="ruby-identifier">delete_at</span>(<span class="ruby-value">0</span>) | |
| 20 | + <span class="ruby-keyword kw">end</span> | |
| 21 | + <span class="ruby-keyword kw">end</span> | |
| 22 | + | |
| 23 | + <span class="ruby-identifier">ngrams_array</span>.<span class="ruby-identifier">uniq</span> | |
| 24 | + <span class="ruby-keyword kw">end</span></pre> | |
| 25 | +</body> | |
| 26 | +</html> |
doc/classes/Mirimiri/Document.src/M000024.html
| 1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
| 2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
| 3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
| 4 | +<head> | |
| 5 | + <title>count_words (Mirimiri::Document)</title> | |
| 6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
| 7 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
| 8 | +</head> | |
| 9 | +<body class="standalone-code"> | |
| 10 | + <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 67</span> | |
| 11 | + <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">count_words</span> | |
| 12 | + <span class="ruby-identifier">counts</span> = <span class="ruby-constant">Hash</span>.<span class="ruby-identifier">new</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">h</span>,<span class="ruby-identifier">k</span><span class="ruby-operator">|</span> <span class="ruby-identifier">h</span>[<span class="ruby-identifier">k</span>] = <span class="ruby-value">0</span> } | |
| 13 | + <span class="ruby-ivar">@words</span>.<span class="ruby-identifier">each</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">w</span><span class="ruby-operator">|</span> <span class="ruby-identifier">counts</span>[<span class="ruby-identifier">w</span>] <span class="ruby-operator">+=</span> <span class="ruby-value">1</span> } | |
| 14 | + | |
| 15 | + <span class="ruby-identifier">counts</span> | |
| 16 | + <span class="ruby-keyword kw">end</span></pre> | |
| 17 | +</body> | |
| 18 | +</html> |
doc/classes/Mirimiri/Document.src/M000025.html
| 1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
| 2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
| 3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
| 4 | +<head> | |
| 5 | + <title>entropy (Mirimiri::Document)</title> | |
| 6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
| 7 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
| 8 | +</head> | |
| 9 | +<body class="standalone-code"> | |
| 10 | + <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 81</span> | |
| 11 | + <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">entropy</span>(<span class="ruby-identifier">s</span>) | |
| 12 | + <span class="ruby-identifier">en</span> = <span class="ruby-value">0</span><span class="ruby-value">.0</span> | |
| 13 | + <span class="ruby-identifier">counts</span> = <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">count_words</span> | |
| 14 | + | |
| 15 | + <span class="ruby-identifier">s</span>.<span class="ruby-identifier">split</span>.<span class="ruby-identifier">each</span> <span class="ruby-keyword kw">do</span> <span class="ruby-operator">|</span><span class="ruby-identifier">w</span><span class="ruby-operator">|</span> | |
| 16 | + <span class="ruby-identifier">p_wi</span> = <span class="ruby-identifier">counts</span>[<span class="ruby-identifier">w</span>].<span class="ruby-identifier">to_f</span><span class="ruby-operator">/</span><span class="ruby-ivar">@words</span>.<span class="ruby-identifier">count</span>.<span class="ruby-identifier">to_f</span> | |
| 17 | + <span class="ruby-identifier">en</span> <span class="ruby-operator">+=</span> <span class="ruby-identifier">p_wi</span><span class="ruby-operator">*</span><span class="ruby-constant">Math</span>.<span class="ruby-identifier">log2</span>(<span class="ruby-identifier">p_wi</span>) | |
| 18 | + <span class="ruby-keyword kw">end</span> | |
| 19 | + | |
| 20 | + <span class="ruby-identifier">en</span> <span class="ruby-operator">*=</span> <span class="ruby-value">-1</span> | |
| 21 | + <span class="ruby-identifier">en</span> | |
| 22 | + <span class="ruby-keyword kw">end</span></pre> | |
| 23 | +</body> | |
| 24 | +</html> |
doc/classes/Mirimiri/Document.src/M000026.html
| 1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
| 2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
| 3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
| 4 | +<head> | |
| 5 | + <title>tf (Mirimiri::Document)</title> | |
| 6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
| 7 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
| 8 | +</head> | |
| 9 | +<body class="standalone-code"> | |
| 10 | + <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 97</span> | |
| 11 | + <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">tf</span>(<span class="ruby-identifier">s</span>) | |
| 12 | + <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">count_words</span>[<span class="ruby-identifier">s</span>].<span class="ruby-identifier">to_f</span><span class="ruby-operator">/</span><span class="ruby-ivar">@words</span>.<span class="ruby-identifier">size</span>.<span class="ruby-identifier">to_f</span> | |
| 13 | + <span class="ruby-keyword kw">end</span></pre> | |
| 14 | +</body> | |
| 15 | +</html> |
doc/classes/Mirimiri/Document.src/M000027.html
| 1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
| 2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
| 3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
| 4 | +<head> | |
| 5 | + <title>new (Mirimiri::Document)</title> | |
| 6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
| 7 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
| 8 | +</head> | |
| 9 | +<body class="standalone-code"> | |
| 10 | + <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 102</span> | |
| 11 | + <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">content</span>) | |
| 12 | + <span class="ruby-ivar">@doc_content</span> = <span class="ruby-identifier">content</span> | |
| 13 | + <span class="ruby-ivar">@words</span> = <span class="ruby-identifier">format_words</span> | |
| 14 | + <span class="ruby-keyword kw">end</span></pre> | |
| 15 | +</body> | |
| 16 | +</html> |
doc/classes/Mirimiri/WebDocument.html
| 1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
| 2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
| 3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
| 4 | +<head> | |
| 5 | + <title>Class: Mirimiri::WebDocument [RDoc Documentation]</title> | |
| 6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
| 7 | + <meta http-equiv="Content-Script-Type" content="text/javascript" /> | |
| 8 | + <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" /> | |
| 9 | + <script type="text/javascript"> | |
| 10 | + // <![CDATA[ | |
| 11 | + | |
| 12 | + function popupCode( url ) { | |
| 13 | + window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | |
| 14 | + } | |
| 15 | + | |
| 16 | + function toggleCode( id ) { | |
| 17 | + if ( document.getElementById ) | |
| 18 | + elem = document.getElementById( id ); | |
| 19 | + else if ( document.all ) | |
| 20 | + elem = eval( "document.all." + id ); | |
| 21 | + else | |
| 22 | + return false; | |
| 23 | + | |
| 24 | + elemStyle = elem.style; | |
| 25 | + | |
| 26 | + if ( elemStyle.display != "block" ) { | |
| 27 | + elemStyle.display = "block" | |
| 28 | + } else { | |
| 29 | + elemStyle.display = "none" | |
| 30 | + } | |
| 31 | + | |
| 32 | + return true; | |
| 33 | + } | |
| 34 | + | |
| 35 | + // Make codeblocks hidden by default | |
| 36 | + document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | |
| 37 | + | |
| 38 | + // ]]> | |
| 39 | + </script> | |
| 40 | + | |
| 41 | +</head> | |
| 42 | +<body> | |
| 43 | + | |
| 44 | + | |
| 45 | + <div id="classHeader"> | |
| 46 | + <table class="header-table"> | |
| 47 | + <tr class="top-aligned-row"> | |
| 48 | + <td><strong>Class</strong></td> | |
| 49 | + <td class="class-name-in-header">Mirimiri::WebDocument</td> | |
| 50 | + </tr> | |
| 51 | + <tr class="top-aligned-row"> | |
| 52 | + <td><strong>In:</strong></td> | |
| 53 | + <td> | |
| 54 | + | |
| 55 | + | |
| 56 | + <a href="../../files/lib/mirimiri/document_rb.html"> | |
| 57 | + | |
| 58 | + lib/mirimiri/document.rb | |
| 59 | + | |
| 60 | + </a> | |
| 61 | + | |
| 62 | + | |
| 63 | + <br /> | |
| 64 | + | |
| 65 | + </td> | |
| 66 | + </tr> | |
| 67 | + | |
| 68 | + | |
| 69 | + <tr class="top-aligned-row"> | |
| 70 | + <td><strong>Parent:</strong></td> | |
| 71 | + <td> | |
| 72 | + | |
| 73 | + <a href="Document.html"> | |
| 74 | + | |
| 75 | + Mirimiri::Document | |
| 76 | + | |
| 77 | + </a> | |
| 78 | + | |
| 79 | + </td> | |
| 80 | + </tr> | |
| 81 | + | |
| 82 | + </table> | |
| 83 | + </div> | |
| 84 | + <!-- banner header --> | |
| 85 | + | |
| 86 | + <div id="bodyContent"> | |
| 87 | + | |
| 88 | + <div id="contextContent"> | |
| 89 | + | |
| 90 | + <div id="description"> | |
| 91 | + <p> | |
| 92 | +A <a href="WebDocument.html">WebDocument</a> is a <a | |
| 93 | +href="Document.html">Document</a> with a <tt>url</tt>. | |
| 94 | +</p> | |
| 95 | + | |
| 96 | + </div> | |
| 97 | + | |
| 98 | + </div> | |
| 99 | + | |
| 100 | + | |
| 101 | + <div id="method-list"> | |
| 102 | + <h3 class="section-bar">Methods</h3> | |
| 103 | + | |
| 104 | + <div class="name-list"> | |
| 105 | + | |
| 106 | + <a href="#M000028">get_content</a> | |
| 107 | + | |
| 108 | + <a href="#M000029">new</a> | |
| 109 | + | |
| 110 | + </div> | |
| 111 | + </div> | |
| 112 | + | |
| 113 | + </div> | |
| 114 | + | |
| 115 | + <!-- if includes --> | |
| 116 | + | |
| 117 | + <div id="section"> | |
| 118 | + | |
| 119 | + | |
| 120 | + | |
| 121 | + <div id="attribute-list"> | |
| 122 | + <h3 class="section-bar">Attributes</h3> | |
| 123 | + | |
| 124 | + <div class="name-list"> | |
| 125 | + <table> | |
| 126 | + | |
| 127 | + <tr class="top-aligned-row context-row"> | |
| 128 | + <td class="context-item-name">url</td> | |
| 129 | + | |
| 130 | + <td class="context-item-value"> [R] </td> | |
| 131 | + | |
| 132 | + <td class="context-item-desc"></td> | |
| 133 | + </tr> | |
| 134 | + | |
| 135 | + </table> | |
| 136 | + </div> | |
| 137 | + </div> | |
| 138 | + | |
| 139 | + | |
| 140 | + <!-- if method_list --> | |
| 141 | + | |
| 142 | + <div id="methods"> | |
| 143 | + | |
| 144 | + <h3 class="section-bar">Public Class methods</h3> | |
| 145 | + | |
| 146 | + | |
| 147 | + <div id="method-M000028" class="method-detail"> | |
| 148 | + <a name="M000028"></a> | |
| 149 | + | |
| 150 | + <div class="method-heading"> | |
| 151 | + | |
| 152 | + <a href="WebDocument.src/M000028.html" target="Code" class="method-signature" | |
| 153 | + onclick="popupCode('WebDocument.src/M000028.html');return false;"> | |
| 154 | + | |
| 155 | + <span class="method-name">get_content</span><span class="method-args">(url)</span> | |
| 156 | + | |
| 157 | + </a> | |
| 158 | + | |
| 159 | + </div> | |
| 160 | + | |
| 161 | + <div class="method-description"> | |
| 162 | + | |
| 163 | + <p> | |
| 164 | +Returns the HTML text from the page of a given <tt>url</tt>. | |
| 165 | +</p> | |
| 166 | + | |
| 167 | + </div> | |
| 168 | + </div> | |
| 169 | + | |
| 170 | + | |
| 171 | + <div id="method-M000029" class="method-detail"> | |
| 172 | + <a name="M000029"></a> | |
| 173 | + | |
| 174 | + <div class="method-heading"> | |
| 175 | + | |
| 176 | + <a href="WebDocument.src/M000029.html" target="Code" class="method-signature" | |
| 177 | + onclick="popupCode('WebDocument.src/M000029.html');return false;"> | |
| 178 | + | |
| 179 | + <span class="method-name">new</span><span class="method-args">(url)</span> | |
| 180 | + | |
| 181 | + </a> | |
| 182 | + | |
| 183 | + </div> | |
| 184 | + | |
| 185 | + <div class="method-description"> | |
| 186 | + | |
| 187 | + <p> | |
| 188 | +<a href="WebDocument.html">WebDocument</a> constructor, the content of the | |
| 189 | +<a href="Document.html">Document</a> is the HTML page without the tags. | |
| 190 | +</p> | |
| 191 | + | |
| 192 | + </div> | |
| 193 | + </div> | |
| 194 | + | |
| 195 | + | |
| 196 | + | |
| 197 | + </div> | |
| 198 | + | |
| 199 | + | |
| 200 | + | |
| 201 | + | |
| 202 | + </div> | |
| 203 | + | |
| 204 | +<div id="validator-badges"> | |
| 205 | + <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | |
| 206 | +</div> | |
| 207 | + | |
| 208 | +</body> | |
| 209 | +</html> |
doc/classes/Mirimiri/WebDocument.src/M000028.html
| 1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
| 2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
| 3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
| 4 | +<head> | |
| 5 | + <title>get_content (Mirimiri::WebDocument)</title> | |
| 6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
| 7 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
| 8 | +</head> | |
| 9 | +<body class="standalone-code"> | |
| 10 | + <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 115</span> | |
| 11 | + <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">get_content</span>(<span class="ruby-identifier">url</span>) | |
| 12 | + <span class="ruby-identifier">require</span> <span class="ruby-value str">'net/http'</span> | |
| 13 | + <span class="ruby-constant">Net</span><span class="ruby-operator">::</span><span class="ruby-constant">HTTP</span>.<span class="ruby-identifier">get</span>(<span class="ruby-constant">URI</span>.<span class="ruby-identifier">parse</span>(<span class="ruby-identifier">url</span>)) | |
| 14 | + <span class="ruby-keyword kw">end</span></pre> | |
| 15 | +</body> | |
| 16 | +</html> |
doc/classes/Mirimiri/WebDocument.src/M000029.html
| 1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
| 2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
| 3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
| 4 | +<head> | |
| 5 | + <title>new (Mirimiri::WebDocument)</title> | |
| 6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
| 7 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
| 8 | +</head> | |
| 9 | +<body class="standalone-code"> | |
| 10 | + <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 122</span> | |
| 11 | + <span class="ruby-keyword kw">def</span> <span class="ruby-identifier">initialize</span>(<span class="ruby-identifier">url</span>) | |
| 12 | + <span class="ruby-ivar">@url</span> = <span class="ruby-identifier">url</span> | |
| 13 | + <span class="ruby-keyword kw">super</span> <span class="ruby-constant">WebDocument</span>.<span class="ruby-identifier">get_content</span>(<span class="ruby-identifier">url</span>).<span class="ruby-identifier">strip_javascripts</span>.<span class="ruby-identifier">strip_stylesheets</span>.<span class="ruby-identifier">strip_xml_tags</span> | |
| 14 | + <span class="ruby-keyword kw">end</span></pre> | |
| 15 | +</body> | |
| 16 | +</html> |
doc/classes/Mirimiri/WikipediaPage.html
| 1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
| 2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
| 3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
| 4 | +<head> | |
| 5 | + <title>Class: Mirimiri::WikipediaPage [RDoc Documentation]</title> | |
| 6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
| 7 | + <meta http-equiv="Content-Script-Type" content="text/javascript" /> | |
| 8 | + <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" /> | |
| 9 | + <script type="text/javascript"> | |
| 10 | + // <![CDATA[ | |
| 11 | + | |
| 12 | + function popupCode( url ) { | |
| 13 | + window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | |
| 14 | + } | |
| 15 | + | |
| 16 | + function toggleCode( id ) { | |
| 17 | + if ( document.getElementById ) | |
| 18 | + elem = document.getElementById( id ); | |
| 19 | + else if ( document.all ) | |
| 20 | + elem = eval( "document.all." + id ); | |
| 21 | + else | |
| 22 | + return false; | |
| 23 | + | |
| 24 | + elemStyle = elem.style; | |
| 25 | + | |
| 26 | + if ( elemStyle.display != "block" ) { | |
| 27 | + elemStyle.display = "block" | |
| 28 | + } else { | |
| 29 | + elemStyle.display = "none" | |
| 30 | + } | |
| 31 | + | |
| 32 | + return true; | |
| 33 | + } | |
| 34 | + | |
| 35 | + // Make codeblocks hidden by default | |
| 36 | + document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | |
| 37 | + | |
| 38 | + // ]]> | |
| 39 | + </script> | |
| 40 | + | |
| 41 | +</head> | |
| 42 | +<body> | |
| 43 | + | |
| 44 | + | |
| 45 | + <div id="classHeader"> | |
| 46 | + <table class="header-table"> | |
| 47 | + <tr class="top-aligned-row"> | |
| 48 | + <td><strong>Class</strong></td> | |
| 49 | + <td class="class-name-in-header">Mirimiri::WikipediaPage</td> | |
| 50 | + </tr> | |
| 51 | + <tr class="top-aligned-row"> | |
| 52 | + <td><strong>In:</strong></td> | |
| 53 | + <td> | |
| 54 | + | |
| 55 | + | |
| 56 | + <a href="../../files/lib/mirimiri/document_rb.html"> | |
| 57 | + | |
| 58 | + lib/mirimiri/document.rb | |
| 59 | + | |
| 60 | + </a> | |
| 61 | + | |
| 62 | + | |
| 63 | + <br /> | |
| 64 | + | |
| 65 | + </td> | |
| 66 | + </tr> | |
| 67 | + | |
| 68 | + | |
| 69 | + <tr class="top-aligned-row"> | |
| 70 | + <td><strong>Parent:</strong></td> | |
| 71 | + <td> | |
| 72 | + | |
| 73 | + <a href="WebDocument.html"> | |
| 74 | + | |
| 75 | + Mirimiri::WebDocument | |
| 76 | + | |
| 77 | + </a> | |
| 78 | + | |
| 79 | + </td> | |
| 80 | + </tr> | |
| 81 | + | |
| 82 | + </table> | |
| 83 | + </div> | |
| 84 | + <!-- banner header --> | |
| 85 | + | |
| 86 | + <div id="bodyContent"> | |
| 87 | + | |
| 88 | + <div id="contextContent"> | |
| 89 | + | |
| 90 | + <div id="description"> | |
| 91 | + <p> | |
| 92 | +A <a href="WikipediaPage.html">WikipediaPage</a> is a <a | |
| 93 | +href="WebDocument.html">WebDocument</a>. | |
| 94 | +</p> | |
| 95 | + | |
| 96 | + </div> | |
| 97 | + | |
| 98 | + </div> | |
| 99 | + | |
| 100 | + | |
| 101 | + <div id="method-list"> | |
| 102 | + <h3 class="section-bar">Methods</h3> | |
| 103 | + | |
| 104 | + <div class="name-list"> | |
| 105 | + | |
| 106 | + <a href="#M000031">get_url</a> | |
| 107 | + | |
| 108 | + <a href="#M000032">search_homepage</a> | |
| 109 | + | |
| 110 | + <a href="#M000030">search_wikipedia_titles</a> | |
| 111 | + | |
| 112 | + </div> | |
| 113 | + </div> | |
| 114 | + | |
| 115 | + </div> | |
| 116 | + | |
| 117 | + <!-- if includes --> | |
| 118 | + | |
| 119 | + <div id="section"> | |
| 120 | + | |
| 121 | + | |
| 122 | + | |
| 123 | + | |
| 124 | + <!-- if method_list --> | |
| 125 | + | |
| 126 | + <div id="methods"> | |
| 127 | + | |
| 128 | + <h3 class="section-bar">Public Class methods</h3> | |
| 129 | + | |
| 130 | + | |
| 131 | + <div id="method-M000031" class="method-detail"> | |
| 132 | + <a name="M000031"></a> | |
| 133 | + | |
| 134 | + <div class="method-heading"> | |
| 135 | + | |
| 136 | + <a href="WikipediaPage.src/M000031.html" target="Code" class="method-signature" | |
| 137 | + onclick="popupCode('WikipediaPage.src/M000031.html');return false;"> | |
| 138 | + | |
| 139 | + <span class="method-name">get_url</span><span class="method-args">(name)</span> | |
| 140 | + | |
| 141 | + </a> | |
| 142 | + | |
| 143 | + </div> | |
| 144 | + | |
| 145 | + <div class="method-description"> | |
| 146 | + | |
| 147 | + </div> | |
| 148 | + </div> | |
| 149 | + | |
| 150 | + | |
| 151 | + <div id="method-M000032" class="method-detail"> | |
| 152 | + <a name="M000032"></a> | |
| 153 | + | |
| 154 | + <div class="method-heading"> | |
| 155 | + | |
| 156 | + <a href="WikipediaPage.src/M000032.html" target="Code" class="method-signature" | |
| 157 | + onclick="popupCode('WikipediaPage.src/M000032.html');return false;"> | |
| 158 | + | |
| 159 | + <span class="method-name">search_homepage</span><span class="method-args">(name)</span> | |
| 160 | + | |
| 161 | + </a> | |
| 162 | + | |
| 163 | + </div> | |
| 164 | + | |
| 165 | + <div class="method-description"> | |
| 166 | + | |
| 167 | + </div> | |
| 168 | + </div> | |
| 169 | + | |
| 170 | + | |
| 171 | + <div id="method-M000030" class="method-detail"> | |
| 172 | + <a name="M000030"></a> | |
| 173 | + | |
| 174 | + <div class="method-heading"> | |
| 175 | + | |
| 176 | + <a href="WikipediaPage.src/M000030.html" target="Code" class="method-signature" | |
| 177 | + onclick="popupCode('WikipediaPage.src/M000030.html');return false;"> | |
| 178 | + | |
| 179 | + <span class="method-name">search_wikipedia_titles</span><span class="method-args">(name)</span> | |
| 180 | + | |
| 181 | + </a> | |
| 182 | + | |
| 183 | + </div> | |
| 184 | + | |
| 185 | + <div class="method-description"> | |
| 186 | + | |
| 187 | + </div> | |
| 188 | + </div> | |
| 189 | + | |
| 190 | + | |
| 191 | + | |
| 192 | + </div> | |
| 193 | + | |
| 194 | + | |
| 195 | + | |
| 196 | + | |
| 197 | + </div> | |
| 198 | + | |
| 199 | +<div id="validator-badges"> | |
| 200 | + <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | |
| 201 | +</div> | |
| 202 | + | |
| 203 | +</body> | |
| 204 | +</html> |
doc/classes/Mirimiri/WikipediaPage.src/M000030.html
| 1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
| 2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
| 3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
| 4 | +<head> | |
| 5 | + <title>search_wikipedia_titles (Mirimiri::WikipediaPage)</title> | |
| 6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
| 7 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
| 8 | +</head> | |
| 9 | +<body class="standalone-code"> | |
| 10 | + <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 135</span> | |
| 11 | + <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">search_wikipedia_titles</span>(<span class="ruby-identifier">name</span>) | |
| 12 | + <span class="ruby-identifier">raise</span> <span class="ruby-constant">ArgumentError</span>, <span class="ruby-value str">"Bad encoding"</span>, <span class="ruby-identifier">name</span> <span class="ruby-keyword kw">unless</span> <span class="ruby-identifier">name</span>.<span class="ruby-identifier">isutf8</span> | |
| 13 | + | |
| 14 | + <span class="ruby-identifier">res</span> = <span class="ruby-constant">REXML</span><span class="ruby-operator">::</span><span class="ruby-constant">Document</span>.<span class="ruby-identifier">new</span>(<span class="ruby-constant">Net</span><span class="ruby-operator">::</span><span class="ruby-constant">HTTP</span>.<span class="ruby-identifier">get</span>( <span class="ruby-constant">URI</span>.<span class="ruby-identifier">parse</span> <span class="ruby-node">"http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml"</span> ).<span class="ruby-identifier">toutf8</span>).<span class="ruby-identifier">elements</span>[<span class="ruby-value str">'api/query/search'</span>] | |
| 15 | + | |
| 16 | + <span class="ruby-identifier">res</span>.<span class="ruby-identifier">collect</span> { <span class="ruby-operator">|</span><span class="ruby-identifier">e</span><span class="ruby-operator">|</span> <span class="ruby-identifier">e</span>.<span class="ruby-identifier">attributes</span>[<span class="ruby-value str">'title'</span>] } <span class="ruby-keyword kw">unless</span> <span class="ruby-identifier">res</span>.<span class="ruby-identifier">nil?</span> | |
| 17 | + <span class="ruby-keyword kw">end</span></pre> | |
| 18 | +</body> | |
| 19 | +</html> |
doc/classes/Mirimiri/WikipediaPage.src/M000031.html
| 1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
| 2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
| 3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
| 4 | +<head> | |
| 5 | + <title>get_url (Mirimiri::WikipediaPage)</title> | |
| 6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
| 7 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
| 8 | +</head> | |
| 9 | +<body class="standalone-code"> | |
| 10 | + <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 143</span> | |
| 11 | + <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">get_url</span>(<span class="ruby-identifier">name</span>) | |
| 12 | + <span class="ruby-identifier">raise</span> <span class="ruby-constant">ArgumentError</span>, <span class="ruby-value str">"Bad encoding"</span>, <span class="ruby-identifier">name</span> <span class="ruby-keyword kw">unless</span> <span class="ruby-identifier">name</span>.<span class="ruby-identifier">isutf8</span> | |
| 13 | + | |
| 14 | + <span class="ruby-identifier">atts</span> = <span class="ruby-constant">REXML</span><span class="ruby-operator">::</span><span class="ruby-constant">Document</span>.<span class="ruby-identifier">new</span>(<span class="ruby-constant">Net</span><span class="ruby-operator">::</span><span class="ruby-constant">HTTP</span>.<span class="ruby-identifier">get</span>( <span class="ruby-constant">URI</span>.<span class="ruby-identifier">parse</span> <span class="ruby-node">"http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml"</span> ).<span class="ruby-identifier">toutf8</span>).<span class="ruby-identifier">elements</span>[<span class="ruby-value str">'api/query/pages/page'</span>].<span class="ruby-identifier">attributes</span> | |
| 15 | + | |
| 16 | + <span class="ruby-identifier">atts</span>[<span class="ruby-value str">'fullurl'</span>] <span class="ruby-keyword kw">if</span> <span class="ruby-identifier">atts</span>[<span class="ruby-value str">'missing'</span>].<span class="ruby-identifier">nil?</span> | |
| 17 | + <span class="ruby-keyword kw">end</span></pre> | |
| 18 | +</body> | |
| 19 | +</html> |
doc/classes/Mirimiri/WikipediaPage.src/M000032.html
| 1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
| 2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
| 3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
| 4 | +<head> | |
| 5 | + <title>search_homepage (Mirimiri::WikipediaPage)</title> | |
| 6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
| 7 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
| 8 | +</head> | |
| 9 | +<body class="standalone-code"> | |
| 10 | + <pre><span class="ruby-comment cmt"># File lib/mirimiri/document.rb, line 151</span> | |
| 11 | + <span class="ruby-keyword kw">def</span> <span class="ruby-keyword kw">self</span>.<span class="ruby-identifier">search_homepage</span>(<span class="ruby-identifier">name</span>) | |
| 12 | + <span class="ruby-identifier">title</span> = <span class="ruby-constant">WikipediaPage</span>.<span class="ruby-identifier">search_wikipedia_titles</span> <span class="ruby-identifier">name</span> | |
| 13 | + | |
| 14 | + <span class="ruby-constant">WikipediaPage</span>.<span class="ruby-identifier">new</span>(<span class="ruby-constant">WikipediaPage</span>.<span class="ruby-identifier">get_url</span> <span class="ruby-identifier">title</span>[<span class="ruby-value">0</span>]) <span class="ruby-keyword kw">unless</span> <span class="ruby-identifier">title</span>.<span class="ruby-identifier">nil?</span> <span class="ruby-operator">||</span> <span class="ruby-identifier">title</span>.<span class="ruby-identifier">empty?</span> | |
| 15 | + <span class="ruby-keyword kw">end</span></pre> | |
| 16 | +</body> | |
| 17 | +</html> |
doc/files/lib/mirimiri/corpus_rb.html
| 1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
| 2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
| 3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
| 4 | +<head> | |
| 5 | + <title>File: corpus.rb [RDoc Documentation]</title> | |
| 6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
| 7 | + <meta http-equiv="Content-Script-Type" content="text/javascript" /> | |
| 8 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
| 9 | + <script type="text/javascript"> | |
| 10 | + // <![CDATA[ | |
| 11 | + | |
| 12 | + function popupCode( url ) { | |
| 13 | + window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | |
| 14 | + } | |
| 15 | + | |
| 16 | + function toggleCode( id ) { | |
| 17 | + if ( document.getElementById ) | |
| 18 | + elem = document.getElementById( id ); | |
| 19 | + else if ( document.all ) | |
| 20 | + elem = eval( "document.all." + id ); | |
| 21 | + else | |
| 22 | + return false; | |
| 23 | + | |
| 24 | + elemStyle = elem.style; | |
| 25 | + | |
| 26 | + if ( elemStyle.display != "block" ) { | |
| 27 | + elemStyle.display = "block" | |
| 28 | + } else { | |
| 29 | + elemStyle.display = "none" | |
| 30 | + } | |
| 31 | + | |
| 32 | + return true; | |
| 33 | + } | |
| 34 | + | |
| 35 | + // Make codeblocks hidden by default | |
| 36 | + document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | |
| 37 | + | |
| 38 | + // ]]> | |
| 39 | + </script> | |
| 40 | + | |
| 41 | +</head> | |
| 42 | +<body> | |
| 43 | + | |
| 44 | + | |
| 45 | + <div id="fileHeader"> | |
| 46 | + <h1>corpus.rb</h1> | |
| 47 | + <table class="header-table"> | |
| 48 | + <tr class="top-aligned-row"> | |
| 49 | + <td><strong>Path:</strong></td> | |
| 50 | + <td>lib/mirimiri/corpus.rb | |
| 51 | + | |
| 52 | + </td> | |
| 53 | + </tr> | |
| 54 | + <tr class="top-aligned-row"> | |
| 55 | + <td><strong>Last Update:</strong></td> | |
| 56 | + <td>2010-12-20 10:35:26 +0100</td> | |
| 57 | + </tr> | |
| 58 | + </table> | |
| 59 | + </div> | |
| 60 | + <!-- banner header --> | |
| 61 | + | |
| 62 | + <div id="bodyContent"> | |
| 63 | + | |
| 64 | + <div id="contextContent"> | |
| 65 | + | |
| 66 | + </div> | |
| 67 | + | |
| 68 | + | |
| 69 | + </div> | |
| 70 | + | |
| 71 | + <!-- if includes --> | |
| 72 | + | |
| 73 | + <div id="section"> | |
| 74 | + | |
| 75 | + | |
| 76 | + | |
| 77 | + | |
| 78 | + <!-- if method_list --> | |
| 79 | + | |
| 80 | + | |
| 81 | + | |
| 82 | + | |
| 83 | + </div> | |
| 84 | + | |
| 85 | +<div id="validator-badges"> | |
| 86 | + <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | |
| 87 | +</div> | |
| 88 | + | |
| 89 | +</body> | |
| 90 | +</html> |
doc/files/lib/mirimiri/document_rb.html
| 1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
| 2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
| 3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
| 4 | +<head> | |
| 5 | + <title>File: document.rb [RDoc Documentation]</title> | |
| 6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
| 7 | + <meta http-equiv="Content-Script-Type" content="text/javascript" /> | |
| 8 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
| 9 | + <script type="text/javascript"> | |
| 10 | + // <![CDATA[ | |
| 11 | + | |
| 12 | + function popupCode( url ) { | |
| 13 | + window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | |
| 14 | + } | |
| 15 | + | |
| 16 | + function toggleCode( id ) { | |
| 17 | + if ( document.getElementById ) | |
| 18 | + elem = document.getElementById( id ); | |
| 19 | + else if ( document.all ) | |
| 20 | + elem = eval( "document.all." + id ); | |
| 21 | + else | |
| 22 | + return false; | |
| 23 | + | |
| 24 | + elemStyle = elem.style; | |
| 25 | + | |
| 26 | + if ( elemStyle.display != "block" ) { | |
| 27 | + elemStyle.display = "block" | |
| 28 | + } else { | |
| 29 | + elemStyle.display = "none" | |
| 30 | + } | |
| 31 | + | |
| 32 | + return true; | |
| 33 | + } | |
| 34 | + | |
| 35 | + // Make codeblocks hidden by default | |
| 36 | + document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | |
| 37 | + | |
| 38 | + // ]]> | |
| 39 | + </script> | |
| 40 | + | |
| 41 | +</head> | |
| 42 | +<body> | |
| 43 | + | |
| 44 | + | |
| 45 | + <div id="fileHeader"> | |
| 46 | + <h1>document.rb</h1> | |
| 47 | + <table class="header-table"> | |
| 48 | + <tr class="top-aligned-row"> | |
| 49 | + <td><strong>Path:</strong></td> | |
| 50 | + <td>lib/mirimiri/document.rb | |
| 51 | + | |
| 52 | + </td> | |
| 53 | + </tr> | |
| 54 | + <tr class="top-aligned-row"> | |
| 55 | + <td><strong>Last Update:</strong></td> | |
| 56 | + <td>2010-12-20 10:36:07 +0100</td> | |
| 57 | + </tr> | |
| 58 | + </table> | |
| 59 | + </div> | |
| 60 | + <!-- banner header --> | |
| 61 | + | |
| 62 | + <div id="bodyContent"> | |
| 63 | + | |
| 64 | + <div id="contextContent"> | |
| 65 | + | |
| 66 | + <div id="requires-list"> | |
| 67 | + <h3 class="section-bar">Required files</h3> | |
| 68 | + | |
| 69 | + <div class="name-list"> | |
| 70 | + | |
| 71 | + net/http | |
| 72 | + | |
| 73 | + rexml/document | |
| 74 | + | |
| 75 | + net/http | |
| 76 | + | |
| 77 | + kconv | |
| 78 | + | |
| 79 | + </div> | |
| 80 | + </div> | |
| 81 | + | |
| 82 | + </div> | |
| 83 | + | |
| 84 | + | |
| 85 | + </div> | |
| 86 | + | |
| 87 | + <!-- if includes --> | |
| 88 | + | |
| 89 | + <div id="section"> | |
| 90 | + | |
| 91 | + | |
| 92 | + | |
| 93 | + | |
| 94 | + <!-- if method_list --> | |
| 95 | + | |
| 96 | + | |
| 97 | + | |
| 98 | + | |
| 99 | + </div> | |
| 100 | + | |
| 101 | +<div id="validator-badges"> | |
| 102 | + <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | |
| 103 | +</div> | |
| 104 | + | |
| 105 | +</body> | |
| 106 | +</html> |
doc/files/lib/mirimiri/query_rb.html
| 1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
| 2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
| 3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
| 4 | +<head> | |
| 5 | + <title>File: query.rb [RDoc Documentation]</title> | |
| 6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
| 7 | + <meta http-equiv="Content-Script-Type" content="text/javascript" /> | |
| 8 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
| 9 | + <script type="text/javascript"> | |
| 10 | + // <![CDATA[ | |
| 11 | + | |
| 12 | + function popupCode( url ) { | |
| 13 | + window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | |
| 14 | + } | |
| 15 | + | |
| 16 | + function toggleCode( id ) { | |
| 17 | + if ( document.getElementById ) | |
| 18 | + elem = document.getElementById( id ); | |
| 19 | + else if ( document.all ) | |
| 20 | + elem = eval( "document.all." + id ); | |
| 21 | + else | |
| 22 | + return false; | |
| 23 | + | |
| 24 | + elemStyle = elem.style; | |
| 25 | + | |
| 26 | + if ( elemStyle.display != "block" ) { | |
| 27 | + elemStyle.display = "block" | |
| 28 | + } else { | |
| 29 | + elemStyle.display = "none" | |
| 30 | + } | |
| 31 | + | |
| 32 | + return true; | |
| 33 | + } | |
| 34 | + | |
| 35 | + // Make codeblocks hidden by default | |
| 36 | + document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | |
| 37 | + | |
| 38 | + // ]]> | |
| 39 | + </script> | |
| 40 | + | |
| 41 | +</head> | |
| 42 | +<body> | |
| 43 | + | |
| 44 | + | |
| 45 | + <div id="fileHeader"> | |
| 46 | + <h1>query.rb</h1> | |
| 47 | + <table class="header-table"> | |
| 48 | + <tr class="top-aligned-row"> | |
| 49 | + <td><strong>Path:</strong></td> | |
| 50 | + <td>lib/mirimiri/query.rb | |
| 51 | + | |
| 52 | + </td> | |
| 53 | + </tr> | |
| 54 | + <tr class="top-aligned-row"> | |
| 55 | + <td><strong>Last Update:</strong></td> | |
| 56 | + <td>2010-12-20 10:36:27 +0100</td> | |
| 57 | + </tr> | |
| 58 | + </table> | |
| 59 | + </div> | |
| 60 | + <!-- banner header --> | |
| 61 | + | |
| 62 | + <div id="bodyContent"> | |
| 63 | + | |
| 64 | + <div id="contextContent"> | |
| 65 | + | |
| 66 | + </div> | |
| 67 | + | |
| 68 | + | |
| 69 | + </div> | |
| 70 | + | |
| 71 | + <!-- if includes --> | |
| 72 | + | |
| 73 | + <div id="section"> | |
| 74 | + | |
| 75 | + | |
| 76 | + | |
| 77 | + | |
| 78 | + <!-- if method_list --> | |
| 79 | + | |
| 80 | + | |
| 81 | + | |
| 82 | + | |
| 83 | + </div> | |
| 84 | + | |
| 85 | +<div id="validator-badges"> | |
| 86 | + <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | |
| 87 | +</div> | |
| 88 | + | |
| 89 | +</body> | |
| 90 | +</html> |
doc/files/lib/mirimiri/regexp_rb.html
| 1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
| 2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
| 3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
| 4 | +<head> | |
| 5 | + <title>File: regexp.rb [RDoc Documentation]</title> | |
| 6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
| 7 | + <meta http-equiv="Content-Script-Type" content="text/javascript" /> | |
| 8 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
| 9 | + <script type="text/javascript"> | |
| 10 | + // <![CDATA[ | |
| 11 | + | |
| 12 | + function popupCode( url ) { | |
| 13 | + window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | |
| 14 | + } | |
| 15 | + | |
| 16 | + function toggleCode( id ) { | |
| 17 | + if ( document.getElementById ) | |
| 18 | + elem = document.getElementById( id ); | |
| 19 | + else if ( document.all ) | |
| 20 | + elem = eval( "document.all." + id ); | |
| 21 | + else | |
| 22 | + return false; | |
| 23 | + | |
| 24 | + elemStyle = elem.style; | |
| 25 | + | |
| 26 | + if ( elemStyle.display != "block" ) { | |
| 27 | + elemStyle.display = "block" | |
| 28 | + } else { | |
| 29 | + elemStyle.display = "none" | |
| 30 | + } | |
| 31 | + | |
| 32 | + return true; | |
| 33 | + } | |
| 34 | + | |
| 35 | + // Make codeblocks hidden by default | |
| 36 | + document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | |
| 37 | + | |
| 38 | + // ]]> | |
| 39 | + </script> | |
| 40 | + | |
| 41 | +</head> | |
| 42 | +<body> | |
| 43 | + | |
| 44 | + | |
| 45 | + <div id="fileHeader"> | |
| 46 | + <h1>regexp.rb</h1> | |
| 47 | + <table class="header-table"> | |
| 48 | + <tr class="top-aligned-row"> | |
| 49 | + <td><strong>Path:</strong></td> | |
| 50 | + <td>lib/mirimiri/regexp.rb | |
| 51 | + | |
| 52 | + </td> | |
| 53 | + </tr> | |
| 54 | + <tr class="top-aligned-row"> | |
| 55 | + <td><strong>Last Update:</strong></td> | |
| 56 | + <td>2010-12-20 10:36:42 +0100</td> | |
| 57 | + </tr> | |
| 58 | + </table> | |
| 59 | + </div> | |
| 60 | + <!-- banner header --> | |
| 61 | + | |
| 62 | + <div id="bodyContent"> | |
| 63 | + | |
| 64 | + <div id="contextContent"> | |
| 65 | + | |
| 66 | + </div> | |
| 67 | + | |
| 68 | + | |
| 69 | + </div> | |
| 70 | + | |
| 71 | + <!-- if includes --> | |
| 72 | + | |
| 73 | + <div id="section"> | |
| 74 | + | |
| 75 | + | |
| 76 | + | |
| 77 | + | |
| 78 | + <!-- if method_list --> | |
| 79 | + | |
| 80 | + | |
| 81 | + | |
| 82 | + | |
| 83 | + </div> | |
| 84 | + | |
| 85 | +<div id="validator-badges"> | |
| 86 | + <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | |
| 87 | +</div> | |
| 88 | + | |
| 89 | +</body> | |
| 90 | +</html> |
doc/files/lib/mirimiri/string_rb.html
| 1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
| 2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
| 3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
| 4 | +<head> | |
| 5 | + <title>File: string.rb [RDoc Documentation]</title> | |
| 6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
| 7 | + <meta http-equiv="Content-Script-Type" content="text/javascript" /> | |
| 8 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
| 9 | + <script type="text/javascript"> | |
| 10 | + // <![CDATA[ | |
| 11 | + | |
| 12 | + function popupCode( url ) { | |
| 13 | + window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | |
| 14 | + } | |
| 15 | + | |
| 16 | + function toggleCode( id ) { | |
| 17 | + if ( document.getElementById ) | |
| 18 | + elem = document.getElementById( id ); | |
| 19 | + else if ( document.all ) | |
| 20 | + elem = eval( "document.all." + id ); | |
| 21 | + else | |
| 22 | + return false; | |
| 23 | + | |
| 24 | + elemStyle = elem.style; | |
| 25 | + | |
| 26 | + if ( elemStyle.display != "block" ) { | |
| 27 | + elemStyle.display = "block" | |
| 28 | + } else { | |
| 29 | + elemStyle.display = "none" | |
| 30 | + } | |
| 31 | + | |
| 32 | + return true; | |
| 33 | + } | |
| 34 | + | |
| 35 | + // Make codeblocks hidden by default | |
| 36 | + document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | |
| 37 | + | |
| 38 | + // ]]> | |
| 39 | + </script> | |
| 40 | + | |
| 41 | +</head> | |
| 42 | +<body> | |
| 43 | + | |
| 44 | + | |
| 45 | + <div id="fileHeader"> | |
| 46 | + <h1>string.rb</h1> | |
| 47 | + <table class="header-table"> | |
| 48 | + <tr class="top-aligned-row"> | |
| 49 | + <td><strong>Path:</strong></td> | |
| 50 | + <td>lib/mirimiri/string.rb | |
| 51 | + | |
| 52 | + </td> | |
| 53 | + </tr> | |
| 54 | + <tr class="top-aligned-row"> | |
| 55 | + <td><strong>Last Update:</strong></td> | |
| 56 | + <td>2010-12-20 10:37:16 +0100</td> | |
| 57 | + </tr> | |
| 58 | + </table> | |
| 59 | + </div> | |
| 60 | + <!-- banner header --> | |
| 61 | + | |
| 62 | + <div id="bodyContent"> | |
| 63 | + | |
| 64 | + <div id="contextContent"> | |
| 65 | + | |
| 66 | + <div id="description"> | |
| 67 | + <hr size="1"></hr><p> | |
| 68 | +General module | |
| 69 | +</p> | |
| 70 | + | |
| 71 | + </div> | |
| 72 | + | |
| 73 | + <div id="requires-list"> | |
| 74 | + <h3 class="section-bar">Required files</h3> | |
| 75 | + | |
| 76 | + <div class="name-list"> | |
| 77 | + | |
| 78 | + cgi | |
| 79 | + | |
| 80 | + kconv | |
| 81 | + | |
| 82 | + </div> | |
| 83 | + </div> | |
| 84 | + | |
| 85 | + </div> | |
| 86 | + | |
| 87 | + | |
| 88 | + </div> | |
| 89 | + | |
| 90 | + <!-- if includes --> | |
| 91 | + | |
| 92 | + <div id="section"> | |
| 93 | + | |
| 94 | + | |
| 95 | + | |
| 96 | + | |
| 97 | + <!-- if method_list --> | |
| 98 | + | |
| 99 | + | |
| 100 | + | |
| 101 | + | |
| 102 | + </div> | |
| 103 | + | |
| 104 | +<div id="validator-badges"> | |
| 105 | + <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | |
| 106 | +</div> | |
| 107 | + | |
| 108 | +</body> | |
| 109 | +</html> |
doc/files/lib/mirimiri/ttagger_rb.html
| 1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
| 2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
| 3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
| 4 | +<head> | |
| 5 | + <title>File: ttagger.rb [RDoc Documentation]</title> | |
| 6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
| 7 | + <meta http-equiv="Content-Script-Type" content="text/javascript" /> | |
| 8 | + <link rel="stylesheet" href="../../.././rdoc-style.css" type="text/css" media="screen" /> | |
| 9 | + <script type="text/javascript"> | |
| 10 | + // <![CDATA[ | |
| 11 | + | |
| 12 | + function popupCode( url ) { | |
| 13 | + window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | |
| 14 | + } | |
| 15 | + | |
| 16 | + function toggleCode( id ) { | |
| 17 | + if ( document.getElementById ) | |
| 18 | + elem = document.getElementById( id ); | |
| 19 | + else if ( document.all ) | |
| 20 | + elem = eval( "document.all." + id ); | |
| 21 | + else | |
| 22 | + return false; | |
| 23 | + | |
| 24 | + elemStyle = elem.style; | |
| 25 | + | |
| 26 | + if ( elemStyle.display != "block" ) { | |
| 27 | + elemStyle.display = "block" | |
| 28 | + } else { | |
| 29 | + elemStyle.display = "none" | |
| 30 | + } | |
| 31 | + | |
| 32 | + return true; | |
| 33 | + } | |
| 34 | + | |
| 35 | + // Make codeblocks hidden by default | |
| 36 | + document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | |
| 37 | + | |
| 38 | + // ]]> | |
| 39 | + </script> | |
| 40 | + | |
| 41 | +</head> | |
| 42 | +<body> | |
| 43 | + | |
| 44 | + | |
| 45 | + <div id="fileHeader"> | |
| 46 | + <h1>ttagger.rb</h1> | |
| 47 | + <table class="header-table"> | |
| 48 | + <tr class="top-aligned-row"> | |
| 49 | + <td><strong>Path:</strong></td> | |
| 50 | + <td>lib/mirimiri/ttagger.rb | |
| 51 | + | |
| 52 | + </td> | |
| 53 | + </tr> | |
| 54 | + <tr class="top-aligned-row"> | |
| 55 | + <td><strong>Last Update:</strong></td> | |
| 56 | + <td>2010-12-20 10:37:32 +0100</td> | |
| 57 | + </tr> | |
| 58 | + </table> | |
| 59 | + </div> | |
| 60 | + <!-- banner header --> | |
| 61 | + | |
| 62 | + <div id="bodyContent"> | |
| 63 | + | |
| 64 | + <div id="contextContent"> | |
| 65 | + | |
| 66 | + </div> | |
| 67 | + | |
| 68 | + | |
| 69 | + </div> | |
| 70 | + | |
| 71 | + <!-- if includes --> | |
| 72 | + | |
| 73 | + <div id="section"> | |
| 74 | + | |
| 75 | + | |
| 76 | + | |
| 77 | + | |
| 78 | + <!-- if method_list --> | |
| 79 | + | |
| 80 | + | |
| 81 | + | |
| 82 | + | |
| 83 | + </div> | |
| 84 | + | |
| 85 | +<div id="validator-badges"> | |
| 86 | + <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | |
| 87 | +</div> | |
| 88 | + | |
| 89 | +</body> | |
| 90 | +</html> |
doc/files/lib/mirimiri_rb.html
| 1 | +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" | |
| 2 | +"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> | |
| 3 | +<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> | |
| 4 | +<head> | |
| 5 | + <title>File: mirimiri.rb [RDoc Documentation]</title> | |
| 6 | + <meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> | |
| 7 | + <meta http-equiv="Content-Script-Type" content="text/javascript" /> | |
| 8 | + <link rel="stylesheet" href="../.././rdoc-style.css" type="text/css" media="screen" /> | |
| 9 | + <script type="text/javascript"> | |
| 10 | + // <![CDATA[ | |
| 11 | + | |
| 12 | + function popupCode( url ) { | |
| 13 | + window.open(url, "Code", "resizable=yes,scrollbars=yes,toolbar=no,status=no,height=150,width=400") | |
| 14 | + } | |
| 15 | + | |
| 16 | + function toggleCode( id ) { | |
| 17 | + if ( document.getElementById ) | |
| 18 | + elem = document.getElementById( id ); | |
| 19 | + else if ( document.all ) | |
| 20 | + elem = eval( "document.all." + id ); | |
| 21 | + else | |
| 22 | + return false; | |
| 23 | + | |
| 24 | + elemStyle = elem.style; | |
| 25 | + | |
| 26 | + if ( elemStyle.display != "block" ) { | |
| 27 | + elemStyle.display = "block" | |
| 28 | + } else { | |
| 29 | + elemStyle.display = "none" | |
| 30 | + } | |
| 31 | + | |
| 32 | + return true; | |
| 33 | + } | |
| 34 | + | |
| 35 | + // Make codeblocks hidden by default | |
| 36 | + document.writeln( "<style type=\"text/css\">div.method-source-code { display: none }<\/style>" ) | |
| 37 | + | |
| 38 | + // ]]> | |
| 39 | + </script> | |
| 40 | + | |
| 41 | +</head> | |
| 42 | +<body> | |
| 43 | + | |
| 44 | + | |
| 45 | + <div id="fileHeader"> | |
| 46 | + <h1>mirimiri.rb</h1> | |
| 47 | + <table class="header-table"> | |
| 48 | + <tr class="top-aligned-row"> | |
| 49 | + <td><strong>Path:</strong></td> | |
| 50 | + <td>lib/mirimiri.rb | |
| 51 | + | |
| 52 | + </td> | |
| 53 | + </tr> | |
| 54 | + <tr class="top-aligned-row"> | |
| 55 | + <td><strong>Last Update:</strong></td> | |
| 56 | + <td>2010-12-20 10:33:51 +0100</td> | |
| 57 | + </tr> | |
| 58 | + </table> | |
| 59 | + </div> | |
| 60 | + <!-- banner header --> | |
| 61 | + | |
| 62 | + <div id="bodyContent"> | |
| 63 | + | |
| 64 | + <div id="contextContent"> | |
| 65 | + | |
| 66 | + <div id="requires-list"> | |
| 67 | + <h3 class="section-bar">Required files</h3> | |
| 68 | + | |
| 69 | + <div class="name-list"> | |
| 70 | + | |
| 71 | + rir/document | |
| 72 | + | |
| 73 | + rir/string | |
| 74 | + | |
| 75 | + rir/query | |
| 76 | + | |
| 77 | + rir/corpus | |
| 78 | + | |
| 79 | + rir/regexp | |
| 80 | + | |
| 81 | + rir/ttagger | |
| 82 | + | |
| 83 | + </div> | |
| 84 | + </div> | |
| 85 | + | |
| 86 | + </div> | |
| 87 | + | |
| 88 | + | |
| 89 | + </div> | |
| 90 | + | |
| 91 | + <!-- if includes --> | |
| 92 | + | |
| 93 | + <div id="section"> | |
| 94 | + | |
| 95 | + | |
| 96 | + | |
| 97 | + | |
| 98 | + <!-- if method_list --> | |
| 99 | + | |
| 100 | + | |
| 101 | + | |
| 102 | + | |
| 103 | + </div> | |
| 104 | + | |
| 105 | +<div id="validator-badges"> | |
| 106 | + <p><small><a href="http://validator.w3.org/check/referer">[Validate]</a></small></p> | |
| 107 | +</div> | |
| 108 | + | |
| 109 | +</body> | |
| 110 | +</html> |
lib/mirimiri.rb
lib/mirimiri/corpus.rb
| 1 | +#!/usr/bin/env ruby | |
| 2 | + | |
| 3 | +#-- | |
| 4 | +# This file is a part of the mirimiri library | |
| 5 | +# | |
| 6 | +# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | |
| 7 | +# | |
| 8 | +# This program is free software: you can redistribute it and/or modify | |
| 9 | +# it under the terms of the GNU General Public License as published by | |
| 10 | +# the Free Software Foundation, either version 3 of the License, or | |
| 11 | +# (at your option) any later version. | |
| 12 | +# | |
| 13 | +# This program is distributed in the hope that it will be useful, | |
| 14 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 15 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 16 | +# GNU General Public License for more details. | |
| 17 | +# | |
| 18 | +# You should have received a copy of the GNU General Public License | |
| 19 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. | |
| 20 | +#++ | |
| 21 | + | |
# A Corpus is a directory tree of files, identified by its root +path+.
class Corpus
  attr_accessor :path

  # Stores +path+ with a single trailing "/" removed, if present.
  def initialize(path)
    @path = path.chomp "/"
  end

  # Recursively lists all regular files below +self.path+, including files
  # that have no extension. Directories are never returned. The returned
  # paths are prefixed with +self.path+.
  # WARNING ! This function may take a lot of time if many
  # files are in subdirectories.
  #
  #   c = Corpus.new "my/path"
  #   c.files # => ["my/path/README", "my/path/lib/code.rb"]
  def files
    Dir.glob(File.join(@path, "**", "*")).select { |entry| File.file?(entry) }
  end
end
lib/mirimiri/document.rb
| 1 | +#!/usr/bin/env ruby | |
| 2 | + | |
| 3 | +#-- | |
| 4 | +# This file is a part of the mirimiri library | |
| 5 | +# | |
| 6 | +# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | |
| 7 | +# | |
| 8 | +# This program is free software: you can redistribute it and/or modify | |
| 9 | +# it under the terms of the GNU General Public License as published by | |
| 10 | +# the Free Software Foundation, either version 3 of the License, or | |
| 11 | +# (at your option) any later version. | |
| 12 | +# | |
| 13 | +# This program is distributed in the hope that it will be useful, | |
| 14 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 15 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 16 | +# GNU General Public License for more details. | |
| 17 | +# | |
| 18 | +# You should have received a copy of the GNU General Public License | |
| 19 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. | |
| 20 | +#++ | |
| 21 | + | |
| 22 | + | |
| 23 | +# General module | |
| 24 | +module Mirimiri | |
| 25 | + | |
| 26 | + # A Document is a bag of words and is constructed from a string. | |
| 27 | + class Document | |
| 28 | + attr_reader :words, :doc_content | |
| 29 | + | |
| 30 | + # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html | |
| 31 | + # and the \\W special escape). | |
| 32 | + # | |
| 33 | + # Protected function, only meant to by called at the initialization. | |
| 34 | + def format_words | |
| 35 | + wo = [] | |
| 36 | + | |
| 37 | + @doc_content.split.each do |w| | |
| 38 | + w.split(/\W/).each do |sw| | |
| 39 | + wo.push(sw.downcase) if sw =~ /[a-zA-Z]/ | |
| 40 | + end | |
| 41 | + end | |
| 42 | + | |
| 43 | + wo | |
| 44 | + end | |
| 45 | + | |
| 46 | + # Returns an Array containing the +n+-grams (words) from the current Document. | |
| 47 | + # | |
| 48 | + # ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...] | |
| 49 | + def ngrams(n) | |
| 50 | + window = [] | |
| 51 | + ngrams_array = [] | |
| 52 | + | |
| 53 | + @words.each do |w| | |
| 54 | + window.push(w) | |
| 55 | + if window.size == n | |
| 56 | + ngrams_array.push window.join(" ") | |
| 57 | + window.delete_at(0) | |
| 58 | + end | |
| 59 | + end | |
| 60 | + | |
| 61 | + ngrams_array.uniq | |
| 62 | + end | |
| 63 | + | |
| 64 | + # Returns a Hash containing the words and their associated counts in the current Document. | |
| 65 | + # | |
| 66 | + # count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... } | |
| 67 | + def count_words | |
| 68 | + counts = Hash.new { |h,k| h[k] = 0 } | |
| 69 | + @words.each { |w| counts[w] += 1 } | |
| 70 | + | |
| 71 | + counts | |
| 72 | + end | |
| 73 | + | |
| 74 | + # Computes the entropy of a given string +s+ inside the document. | |
| 75 | + # | |
| 76 | + # If the string parameter is composed of many words (i.e. tokens separated | |
| 77 | + # by whitespace(s)), it is considered as an ngram. | |
| 78 | + # | |
| 79 | + # entropy("guitar") #=> 0.00432114812727959 | |
| 80 | + # entropy("dillinger escape plan") #=> 0.265862076325102 | |
| 81 | + def entropy(s) | |
| 82 | + en = 0.0 | |
| 83 | + counts = self.count_words | |
| 84 | + | |
| 85 | + s.split.each do |w| | |
| 86 | + p_wi = counts[w].to_f/@words.count.to_f | |
| 87 | + en += p_wi*Math.log2(p_wi) | |
| 88 | + end | |
| 89 | + | |
| 90 | + en *= -1 | |
| 91 | + en | |
| 92 | + end | |
| 93 | + | |
| 94 | + # Computes the term frequency of a given *word* +s+. | |
| 95 | + # | |
| 96 | + # tf("guitar") #=> 0.000380372765310004 | |
| 97 | + def tf(s) | |
| 98 | + self.count_words[s].to_f/@words.size.to_f | |
| 99 | + end | |
| 100 | + | |
| 101 | + | |
| 102 | + def initialize(content) | |
| 103 | + @doc_content = content | |
| 104 | + @words = format_words | |
| 105 | + end | |
| 106 | + | |
| 107 | + protected :format_words | |
| 108 | + end | |
| 109 | + | |
| 110 | + # A WebDocument is a Document with a +url+. | |
| 111 | + class WebDocument < Document | |
| 112 | + attr_reader :url | |
| 113 | + | |
| 114 | + # Returns the HTML text from the page of a given +url+. | |
| 115 | + def self.get_content(url) | |
| 116 | + require 'net/http' | |
| 117 | + Net::HTTP.get(URI.parse(url)) | |
| 118 | + end | |
| 119 | + | |
| 120 | + # WebDocument constructor, the content of the Document is the HTML page | |
| 121 | + # without the tags. | |
| 122 | + def initialize(url) | |
| 123 | + @url = url | |
| 124 | + super WebDocument.get_content(url).strip_javascripts.strip_stylesheets.strip_xml_tags | |
| 125 | + end | |
| 126 | + end | |
| 127 | + | |
| 128 | + # A WikipediaPage is a WebDocument. | |
| 129 | + class WikipediaPage < WebDocument | |
| 130 | + require 'rexml/document' | |
| 131 | + require 'net/http' | |
| 132 | + require 'kconv' | |
| 133 | + | |
| 134 | + | |
| 135 | + def self.search_wikipedia_titles(name) | |
| 136 | + raise ArgumentError, "Bad encoding", name unless name.isutf8 | |
| 137 | + | |
| 138 | + res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).toutf8).elements['api/query/search'] | |
| 139 | + | |
| 140 | + res.collect { |e| e.attributes['title'] } unless res.nil? | |
| 141 | + end | |
| 142 | + | |
| 143 | + def self.get_url(name) | |
| 144 | + raise ArgumentError, "Bad encoding", name unless name.isutf8 | |
| 145 | + | |
| 146 | + atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).toutf8).elements['api/query/pages/page'].attributes | |
| 147 | + | |
| 148 | + atts['fullurl'] if atts['missing'].nil? | |
| 149 | + end | |
| 150 | + | |
| 151 | + def self.search_homepage(name) | |
| 152 | + title = WikipediaPage.search_wikipedia_titles name | |
| 153 | + | |
| 154 | + WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty? | |
| 155 | + end | |
| 156 | + | |
| 157 | +# def initialize(name) | |
| 158 | +# title = WikipediaPage.search_wikipedia_titles name | |
| 159 | +# raise ArgumentError, "No page found" if title.empty? | |
| 160 | +# super WikipediaPage.get_url title[0] | |
| 161 | +# end | |
| 162 | + end | |
| 163 | +end |
lib/mirimiri/query.rb
| 1 | +#!/usr/bin/env ruby | |
| 2 | + | |
| 3 | +#-- | |
| 4 | +# This file is a part of the mirimiri library | |
| 5 | +# | |
| 6 | +# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | |
| 7 | +# | |
| 8 | +# This program is free software: you can redistribute it and/or modify | |
| 9 | +# it under the terms of the GNU General Public License as published by | |
| 10 | +# the Free Software Foundation, either version 3 of the License, or | |
| 11 | +# (at your option) any later version. | |
| 12 | +# | |
| 13 | +# This program is distributed in the hope that it will be useful, | |
| 14 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 15 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 16 | +# GNU General Public License for more details. | |
| 17 | +# | |
| 18 | +# You should have received a copy of the GNU General Public License | |
| 19 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. | |
| 20 | +#++ | |
| 21 | + | |
# Generic query. It defines no behaviour of its own in this file and serves
# as the superclass of Indri::IndriQuery.
class Query
end
| 24 | + | |
| 25 | +module Indri | |
| 26 | + | |
| 27 | + class Parameters | |
| 28 | + attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline | |
| 29 | + | |
| 30 | + def initialize(corpus,mem="1g",count="1000",offset="1",run_id="default",print_query=false,print_docs=false) | |
| 31 | + @index_path = corpus | |
| 32 | + @memory = mem | |
| 33 | + @count = count | |
| 34 | + @offset = offset | |
| 35 | + @run_id = run_id | |
| 36 | + @print_query = print_query ? "true" : "false" | |
| 37 | + @print_docs = print_docs ? "true" : "false" | |
| 38 | + end | |
| 39 | + | |
| 40 | + def to_s | |
| 41 | + h = "<parameters>\n" | |
| 42 | + h += "<memory>#{@memory}</memory>\n" | |
| 43 | + h += "<index>#{@index_path}</index>\n" | |
| 44 | + h += "<count>#{@count}</count>\n" | |
| 45 | + unless @baseline.nil? | |
| 46 | + h += "<baseline>#{@baseline}</baseline>\n" | |
| 47 | + else | |
| 48 | + h += "<rule>#{@rule}</rule>\n" | |
| 49 | + end | |
| 50 | + h += "<queryOffset>#{@offset}</queryOffset>\n" | |
| 51 | + h += "<runID>#{@run_id}</runID>\n" | |
| 52 | + h += "<printQuery>#{@print_query}</printQuery>\n" | |
| 53 | + h += "<printDocuments>#{@print_docs}</printDocuments>\n" | |
| 54 | + | |
| 55 | + h | |
| 56 | + end | |
| 57 | + end | |
| 58 | + | |
| 59 | + class IndriQuery < Query | |
| 60 | + attr_accessor :id, :query, :params, :rule | |
| 61 | + | |
| 62 | + def initialize(id,query,params) | |
| 63 | + @params = params | |
| 64 | + # Here we set the default retrieval model as Language Modeling | |
| 65 | + # with a Dirichlet smoothing at 2500. | |
| 66 | + # TODO: maybe a Rule class... | |
| 67 | + @params.rule = 'method:dirichlet,mu:2500' if @params.rule.nil? | |
| 68 | + | |
| 69 | + @id = id | |
| 70 | + @query = query | |
| 71 | + end | |
| 72 | + | |
| 73 | + def to_s | |
| 74 | + h = @params.to_s | |
| 75 | + h += "<query>\n" | |
| 76 | + h += "<number>#{@id}</number>\n" | |
| 77 | + h += "<text>#{@query}</text>\n" | |
| 78 | + h += "</query>\n" | |
| 79 | + h += "</parameters>" | |
| 80 | + | |
| 81 | + h | |
| 82 | + end | |
| 83 | + end | |
| 84 | + | |
| 85 | +end |
lib/mirimiri/regexp.rb
| 1 | +#!/usr/bin/env ruby | |
| 2 | + | |
| 3 | +#-- | |
| 4 | +# This file is a part of the mirimiri library | |
| 5 | +# | |
| 6 | +# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | |
| 7 | +# | |
| 8 | +# This program is free software: you can redistribute it and/or modify | |
| 9 | +# it under the terms of the GNU General Public License as published by | |
| 10 | +# the Free Software Foundation, either version 3 of the License, or | |
| 11 | +# (at your option) any later version. | |
| 12 | +# | |
| 13 | +# This program is distributed in the hope that it will be useful, | |
| 14 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 15 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 16 | +# GNU General Public License for more details. | |
| 17 | +# | |
| 18 | +# You should have received a copy of the GNU General Public License | |
| 19 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. | |
| 20 | +#++ | |
| 21 | + | |
class Regexp

  # Returns a new Regexp that matches a line only when +self+ matches
  # nowhere in that line. It is built from a negative lookahead applied at
  # every character position between the line anchors.
  #
  #   /foo/.negated =~ "bar"   #=> 0
  #   /foo/.negated =~ "a foo" #=> nil
  def negated
    Regexp.new("^((?!#{self}).)*$")
  end

end
lib/mirimiri/string.rb
| 1 | +#!/usr/bin/env ruby | |
| 2 | + | |
| 3 | +#-- | |
| 4 | +# This file is a part of the mirimiri library | |
| 5 | +# | |
| 6 | +# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | |
| 7 | +# | |
| 8 | +# This program is free software: you can redistribute it and/or modify | |
| 9 | +# it under the terms of the GNU General Public License as published by | |
| 10 | +# the Free Software Foundation, either version 3 of the License, or | |
| 11 | +# (at your option) any later version. | |
| 12 | +# | |
| 13 | +# This program is distributed in the hope that it will be useful, | |
| 14 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 15 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 16 | +# GNU General Public License for more details. | |
| 17 | +# | |
| 18 | +# You should have received a copy of the GNU General Public License | |
| 19 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. | |
| 20 | +#++ | |
| 21 | + | |
| 22 | +module Mirimiri | |
| 23 | + | |
| 24 | + # These are the default stopwords provided by Lemur. | |
| 25 | + Stoplist = [ | |
| 26 | + "a", "anything", "anyway", "anywhere", "apart", "are", "around", "as", "at", "av", | |
| 27 | + "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", | |
| 28 | + "behind", "being", "below", "beside", "besides", "between", "beyond", "both", "but", "by", | |
| 29 | + "can", "cannot", "canst", "certain", "cf", "choose", "contrariwise", "cos", "could", "cu", | |
| 30 | + "day", "do", "does", "doesn't", "doing", "dost", "doth", "double", "down", "dual", "during", | |
| 31 | + "each", "either", "else", "elsewhere", "enough", "et", "etc", "even", "ever", "every", | |
| 32 | + "everybody", "everyone", "everything", "everywhere", "except", "excepted", "excepting", | |
| 33 | + "exception", "exclude", "excluding", "exclusive", "far", "farther", "farthest", "few", "ff", | |
| 34 | + "first", "for", "formerly", "forth", "forward", "from", "front", "further", "furthermore", | |
| 35 | + "furthest", "get", "go", "had", "halves", "hardly", "has", "hast", "hath", "have", "he", | |
| 36 | + "hence", "henceforth", "her", "here", "hereabouts", "hereafter", "hereby", "herein", "hereto", | |
| 37 | + "hereupon", "hers", "herself", "him", "himself", "hindmost", "his", "hither", "hitherto", | |
| 38 | + "how", "however", "howsoever", "i", "ie", "if", "in", "inasmuch", "inc", "include", | |
| 39 | + "included", "including", "indeed", "indoors", "inside", "insomuch", "instead", "into", | |
| 40 | + "inward", "inwards", "is", "it", "its", "itself", "just", "kind", "kg", "km", "last", | |
| 41 | + "latter", "latterly", "less", "lest", "let", "like", "little", "ltd", "many", "may", "maybe", | |
| 42 | + "me", "meantime", "meanwhile", "might", "moreover", "most", "mostly", "more", "mr", "mrs", | |
| 43 | + "ms", "much", "must", "my", "myself", "namely", "need", "neither", "never", "nevertheless", | |
| 44 | + "next", "no", "nobody", "none", "nonetheless", "noone", "nope", "nor", "not", "nothing", | |
| 45 | + "notwithstanding", "now", "nowadays", "nowhere", "of", "off", "often", "ok", "on", "once", | |
| 46 | + "one", "only", "onto", "or", "other", "others", "otherwise", "ought", "our", "ours", | |
| 47 | + "ourselves", "out", "outside", "over", "own", "per", "perhaps", "plenty", "provide", "quite", | |
| 48 | + "rather", "really", "round", "said", "sake", "same", "sang", "save", "saw", "see", "seeing", | |
| 49 | + "seem", "seemed", "seeming", "seems", "seen", "seldom", "selves", "sent", "several", "shalt", | |
| 50 | + "she", "should", "shown", "sideways", "since", "slept", "slew", "slung", "slunk", "smote", | |
| 51 | + "so", "some", "somebody", "somehow", "someone", "something", "sometime", "sometimes", | |
| 52 | + "somewhat", "somewhere", "spake", "spat", "spoke", "spoken", "sprang", "sprung", "stave", | |
| 53 | + "staves", "still", "such", "supposing", "than", "that", "the", "thee", "their", "them", | |
| 54 | + "themselves", "then", "thence", "thenceforth", "there", "thereabout", "thereabouts", | |
| 55 | + "thereafter", "thereby", "therefore", "therein", "thereof", "thereon", "thereto", "thereupon", | |
| 56 | + "these", "they", "this", "those", "thou", "though", "thrice", "through", "throughout", "thru", | |
| 57 | + "thus", "thy", "thyself", "till", "to", "together", "too", "toward", "towards", "ugh", | |
| 58 | + "unable", "under", "underneath", "unless", "unlike", "until", "up", "upon", "upward", | |
| 59 | + "upwards", "us", "use", "used", "using", "very", "via", "vs", "want", "was", "we", "week", | |
| 60 | + "well", "were", "what", "whatever", "whatsoever", "when", "whence", "whenever", "whensoever", | |
| 61 | + "where", "whereabouts", "whereafter", "whereas", "whereat", "whereby", "wherefore", | |
| 62 | + "wherefrom", "wherein", "whereinto", "whereof", "whereon", "wheresoever", "whereto", | |
| 63 | + "whereunto", "whereupon", "wherever", "wherewith", "whether", "whew", "which", "whichever", | |
| 64 | + "whichsoever", "while", "whilst", "whither", "who", "whoa", "whoever", "whole", "whom", | |
| 65 | + "whomever", "whomsoever", "whose", "whosoever", "why", "will", "wilt", "with", "within", | |
| 66 | + "without", "worse", "worst", "would", "wow", "ye", "yet", "year", "yippee", "you", "your", | |
| 67 | + "yours", "yourself", "yourselves" | |
| 68 | + ] | |
| 69 | + | |
| 70 | + | |
| 71 | +end | |
| 72 | + | |
| 73 | +# Extention of the standard class String with useful function. | |
| 74 | +class String | |
| 75 | + include Mirimiri | |
| 76 | + | |
| 77 | + # Returns +true+ if +self+ belongs to Rir::Stoplist, +false+ otherwise. | |
| 78 | + def is_stopword? | |
| 79 | + Stoplist.include?(self.downcase) | |
| 80 | + end | |
| 81 | + | |
| 82 | + # Do not use. | |
| 83 | + # TODO: rewamp. find why this function is here. | |
| 84 | + def remove_special_characters | |
| 85 | + self.split.collect { |w| w.gsub(/\W/,' ').split.collect { |w| w.gsub(/\W/,' ').strip.sub(/\A.\z/, '')}.join(' ').strip.sub(/\A.\z/, '')}.join(' ') | |
| 86 | + end | |
| 87 | + | |
| 88 | + # Removes all XML-like tags from +self+. | |
| 89 | + # | |
| 90 | + # s = "<html><body>test</body></html>" | |
| 91 | + # s.strip_xml_tags! | |
| 92 | + # s #=> "test" | |
| 93 | + def strip_xml_tags! | |
| 94 | + replace strip_with_pattern /<\/?[^>]*>/ | |
| 95 | + end | |
| 96 | + | |
| 97 | + # Removes all XML-like tags from +self+. | |
| 98 | + # | |
| 99 | + # s = "<html><body>test</body></html>" | |
| 100 | + # s.strip_xml_tags #=> "test" | |
| 101 | + # s #=> "<html><body>test</body></html>" | |
| 102 | + def strip_xml_tags | |
| 103 | + dup.strip_xml_tags! | |
| 104 | + end | |
| 105 | + | |
| 106 | + # Removes all Javascript sources from +self+. | |
| 107 | + # | |
| 108 | + # s = "<script type='text/javascript'> | |
| 109 | + # var skin='vector', | |
| 110 | + # stylepath='http://bits.wikimedia.org/skins-1.5' | |
| 111 | + # </script> | |
| 112 | + # | |
| 113 | + # test" | |
| 114 | + # s.strip_javascripts! | |
| 115 | + # s #=> "test" | |
| 116 | + def strip_javascripts! | |
| 117 | + replace strip_with_pattern /<script type="text\/javascript">(.+?)<\/script>/m | |
| 118 | + end | |
| 119 | + | |
| 120 | + # Removes all Javascript sources from +self+. | |
| 121 | + # | |
| 122 | + # s = "<script type='text/javascript'> | |
| 123 | + # var skin='vector', | |
| 124 | + # stylepath='http://bits.wikimedia.org/skins-1.5' | |
| 125 | + # </script> | |
| 126 | + # | |
| 127 | + # test" | |
| 128 | + # s.strip_javascripts #=> "test" | |
| 129 | + def strip_javascripts | |
| 130 | + dup.strip_javascripts! | |
| 131 | + end | |
| 132 | + | |
| 133 | + def strip_stylesheets! | |
| 134 | + # TODO: rewamp. dunno what is it. | |
| 135 | + replace strip_with_pattern /<style type="text\/css">(.+?)<\/style>/m | |
| 136 | + end | |
| 137 | + | |
| 138 | + def strip_stylesheets | |
| 139 | + dup.strip_stylesheets! | |
| 140 | + end | |
| 141 | + | |
| 142 | + # Removes punctuation from +self+. | |
| 143 | + # | |
| 144 | + # s = "hello, world. how are you?!" | |
| 145 | + # s.strip_punctuation! | |
| 146 | + # s # => "hello world how are you" | |
| 147 | + def strip_punctuation! | |
| 148 | + replace strip_with_pattern /[^a-zA-Z0-9\-\s]/ | |
| 149 | + end | |
| 150 | + | |
| 151 | + # Removes punctuation from +self+. | |
| 152 | + # | |
| 153 | + # s = "hello, world. how are you?!" | |
| 154 | + # s.strip_punctuation # => "hello world how are you" | |
def strip_punctuation
  # Non-destructive variant: duplicate first, then strip the duplicate.
  copy = dup
  copy.strip_punctuation!
end
| 158 | + | |
| 159 | + # Returns the text values inside all occurrences of an XML tag in +self+ | |
| 160 | + # | |
| 161 | + # s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre" | |
| 162 | + # s.extract_xmltags_values 'a' #=> ["Indianapolis", "Indiana"] | |
def extract_xmltags_values(tag_name)
  # Escape the name so regexp metacharacters in it are matched literally.
  name = Regexp.escape(tag_name.to_s)

  # The lookahead forces the tag name to end right after +name+, so asking
  # for 'a' no longer matches <abbr> or <article>. Attributes are consumed
  # with [^>]* so the opening tag cannot run past its own '>'.
  scan(/<#{name}(?=[\s>\/])[^>]*>(.+?)<\/#{name}\s*>/).flatten
end
| 166 | + | |
# Deletes every match of +pattern+ from a copy of +self+, HTML-unescapes the
# remainder with CGI.unescapeHTML and returns it converted by Kconv's
# String#toutf8. The receiver itself is not modified.
def strip_with_pattern(pattern)
  require 'cgi'
  require 'kconv'

  without_matches = gsub(pattern, "")
  unescaped = CGI.unescapeHTML(without_matches)
  unescaped.toutf8
end
| 172 | + | |
| 173 | + private :strip_with_pattern | |
| 174 | +end |
lib/mirimiri/ttagger.rb
| 1 | +#!/usr/bin/env ruby | |
| 2 | + | |
| 3 | +#-- | |
| 4 | +# This file is a part of the mirimiri library | |
| 5 | +# | |
| 6 | +# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | |
| 7 | +# | |
| 8 | +# This program is free software: you can redistribute it and/or modify | |
| 9 | +# it under the terms of the GNU General Public License as published by | |
| 10 | +# the Free Software Foundation, either version 3 of the License, or | |
| 11 | +# (at your option) any later version. | |
| 12 | +# | |
| 13 | +# This program is distributed in the hope that it will be useful, | |
| 14 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 15 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 16 | +# GNU General Public License for more details. | |
| 17 | +# | |
| 18 | +# You should have received a copy of the GNU General Public License | |
| 19 | +# along with this program. If not, see <http://www.gnu.org/licenses/>. | |
| 20 | +#++ | |
| 21 | + | |
| 22 | + | |
| 23 | +# TreeTagger-related stuff module. | |
| 24 | +# | |
| 25 | +# See http://www.ims.uni-stuttgart.de/projekte/corplex/TreeTagger/DecisionTreeTagger.html | |
module TreeTagger

  # This class handles generic parsing of tagger-chunker outputs.
  class TaggerChunker
    # +chunks+ is the Array of Chunk built by the constructor; +file+ is the
    # path that was given to the constructor.
    attr_reader :chunks, :file

    # Parses a tagger-chunker output and returns an Array of Chunk.
    #
    # +chunk_lines+ is any object responding to +each+ and yielding the lines
    # of the output. A line made only of <TAG> opens a chunk, a line made only
    # of </TAG> closes it, and the first whitespace-separated field of any
    # other line is collected as a word. The given lines are not modified.
    #
    # A chunk is emitted only when a closing tag matches the most recently
    # opened tag and at least one word has been collected. Words are NOT reset
    # when a tag opens, so words seen before an opening tag end up in the next
    # emitted chunk (NOTE(review): kept as in the original implementation;
    # confirm this is the wanted handling of nested or unchunked tokens).
    def self.parse chunk_lines
      in_chunk = false
      tag      = nil

      chunks = []
      words  = []

      chunk_lines.each do |raw|
        # chomp (not chomp!) so the caller's lines, possibly frozen, are left alone.
        line = raw.chomp

        if line =~ /^<\w+>$/
          in_chunk = true
          tag      = line
        elsif line =~ /^<\/\w+>$/
          # Ignore closing tags that do not close the current chunk.
          next unless in_chunk && !words.empty? && line == tag.sub(/</, '</')

          in_chunk = false
          chunks.push Chunk.new(words.join(" "), tag)
          words.clear
        else
          # Blank lines have no first field; skip them instead of storing nil.
          token = line.split.first
          words.push(token) unless token.nil?
        end
      end

      chunks
    end

    # Initializes parsing. +chunk_file+ is the output of +tagger-chunker+ and must
    # be a valid path to the file.
    #
    #   TaggerChunker.new("ttout/2010020") #=> #<TreeTagger::TaggerChunker:0x92fd088 @chunks=[#<TreeTagger::Chunk:0x8ec5a10 @words=["robert", "schumann"], @tag="NC">, ...] ...>
    def initialize chunk_file
      @file   = chunk_file
      # File.readlines closes the file once read; File.open(...).readlines
      # left the handle open until garbage collection.
      @chunks = TaggerChunker.parse File.readlines(chunk_file)
    end

  end

  class TaggerChunkerEnglish < TaggerChunker
  end

  class TaggerChunkerFrench < TaggerChunker
  end

  class TaggerChunkerGerman < TaggerChunker
  end

  # Represents a Chunk extracted when parsing a TaggerChunker file.
  class Chunk
    attr_reader :words, :tag

    # Creates a Chunk.
    #
    # * +str+ are whitespace-separated terms.
    # * +tag+ is the opening tag line including its angle brackets (e.g. "<NC>");
    #   the brackets are stripped before storing. For the tagset see :
    #   ftp://ftp.ims.uni-stuttgart.de/pub/corpora/chunker-tagset-english.txt
    def initialize str,tag
      @words = str.split
      @tag   = tag[1..-2]
    end
  end

end