Commit 7f3e958ff8a78b1f8251027260efc357b83740cc
0 parents
Exists in
master
first commit. stand-alone functions built on top of mirimiri and lda-ruby to extract contextual features from general corpora.
Showing 2 changed files with 82 additions and 0 deletions Side-by-side Diff
README.markdown
... | ... | @@ -0,0 +1,19 @@ |
1 | +# context | |
2 | + | |
3 | +Copyright (C) 2012 Romain Deveaud <romain.deveaud@gmail.com> | |
4 | + | |
5 | +License | |
6 | +======= | |
7 | + | |
8 | +This program is free software: you can redistribute it and/or modify | |
9 | +it under the terms of the GNU General Public License as published by | |
10 | +the Free Software Foundation, either version 3 of the License, or | |
11 | +(at your option) any later version. | |
12 | + | |
13 | +This program is distributed in the hope that it will be useful, | |
14 | +but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 | +GNU General Public License for more details. | |
17 | + | |
18 | +You should have received a copy of the GNU General Public License | |
19 | +along with this program. If not, see <http://www.gnu.org/licenses/>. |
context.rb
... | ... | @@ -0,0 +1,63 @@ |
1 | +#!/usr/bin/env ruby | |
2 | + | |
3 | +require 'mirimiri' | |
4 | +require 'sanitize' | |
5 | +require 'lda-ruby' | |
6 | + | |
# Stand-alone helpers that derive contextual features for a query from the
# top-ranked ("feedback") documents of an Indri index: weighted n-grams
# (term_context) and LDA topics (topic_context).
module Context
  # Index locations on disk, keyed by a short corpus name.
  IndexPaths = {
    :web_en     => '/mnt/disk2/ClueWeb09_English_1_sDocs',
    :web_fr     => '/mnt/disk2/ClueWeb09_French_1_sDocs',
    :web_nospam => '/mnt/disk1/ClueWeb09_English_1noSpam',
    :gigaword   => '/local/data/GigaWord/index',
    :nyt        => '/local/data/NYT_index',
    :wiki_en    => '/local/data/WikiEn_index',
    :wiki_fr    => '/local/data/WikiFr_index'
  }

  # Builds a `#weight ( score #1(ngram) ... ) ` query string from the
  # best-scored n-grams of the feedback documents retrieved for +query+.
  #
  # index_path - path of the index handed to feedback_docs.
  # query      - query text used to retrieve the feedback documents.
  # size       - maximum number of n-grams kept in the result.
  # num_page   - number of feedback documents to retrieve.
  # args       - optional settings; the hash is read, never modified:
  #              :func   - name of the scoring method invoked on the
  #                        Mirimiri::Document (default :entropy).
  #              :window - largest n-gram length to extract (default 1).
  #
  # Returns the query String, or nil when no n-gram is left to weight.
  def self.term_context(index_path, query, size, num_page, args = {})
    # Read the options without writing defaults back into the caller's hash
    # (which would also fail on a frozen hash).
    func   = (args[:func] || :entropy).to_sym
    window = args[:window] || 1

    docs     = feedback_docs(index_path, query, num_page)
    resource = Mirimiri::Document.new(docs.join(' '))

    # extract_ngrams leaves nil for rejected n-grams; drop them *before*
    # testing for emptiness so an all-rejected list cannot produce the
    # malformed query "#weight (  ) ".
    scored = extract_ngrams(resource, func, window).compact
    best   = scored.sort { |a, b| b[0] <=> a[0] }.first(size)
    return nil if best.empty?

    weighted = best.map { |score, term| "#{score} #1(#{term})" }.join(' ')
    "#weight ( #{weighted} ) "
  end

  # Fits an LDA model on the feedback documents retrieved for +query+ and
  # prints the top +size+ words of each topic to standard output.
  #
  # index_path - path of the index handed to feedback_docs.
  # query      - query text used to retrieve the feedback documents.
  # size       - number of words requested from Lda::Lda#top_words.
  # num_page   - number of feedback documents to retrieve; one topic is
  #              requested per ten documents, with a minimum of one.
  # args       - currently unused.
  #
  # Returns whatever `puts` returns.
  def self.topic_context(index_path, query, size, num_page, args = {})
    corpus = Lda::Corpus.new
    feedback_docs(index_path, query, num_page).each do |text|
      corpus.add_document(Lda::TextDocument.new(corpus, text))
    end

    lda = Lda::Lda.new(corpus)
    # Integer division gives 0 for fewer than 10 documents; a model with no
    # topics is meaningless, so always ask for at least one.
    lda.num_topics = [num_page / 10, 1].max
    lda.em('random')
    puts lda.top_words(size)
  end

  # NOTE: the methods below are internal helpers. A bare `private` does not
  # apply to methods defined on the module object itself, so they have always
  # been publicly callable; they are left callable here to avoid breaking
  # existing callers.

  # Runs +query+ against the index at +index_path+ and returns the text of
  # the first +num_page+ printed documents.
  #
  # The raw runquery output is tagged as ISO-8859-1 and transcoded to UTF-8
  # before the documents are split out; each document is then passed through
  # Sanitize.clean with the contents of `script` elements removed.
  #
  # Returns an Array with one cleaned entry per extracted document.
  def self.feedback_docs(index_path, query, num_page)
    indri_query = Indri::IndriQuery.new({ :query => query, :count => num_page },
                                        '-printDocuments=true -trecFormat=true')
    index  = Indri::IndriIndex.new(index_path)
    output = index.runquery(indri_query).force_encoding('ISO-8859-1').encode('UTF-8')

    Indri::IndriPrintedDocuments.new(output).extract_docs.map do |raw|
      Sanitize.clean(raw, :remove_contents => ['script'])
    end
  end

  # Scores every distinct n-gram of length 1..n found in +resource+.
  #
  # resource - object responding to `ngrams(i)` and to the scoring method
  #            named by +func+.
  # func     - name of the scoring method, called as resource.func(ngram).
  # n        - largest n-gram length to extract.
  #
  # Returns an Array with one entry per distinct n-gram: [score, unaccented
  # n-gram] for kept n-grams and nil for rejected ones (callers are expected
  # to `compact` the result).
  def self.extract_ngrams(resource, func, n)
    scorer = func.to_sym
    ngrams = 1.upto(n).flat_map { |i| resource.ngrams(i) }

    ngrams.uniq.map do |ngram|
      [resource.send(scorer, ngram), ngram.unaccent] unless ngram_rejected?(ngram)
    end
  end

  # True when +ngram+ should be ignored: it is a stopword, all of its words
  # are at most one character long, none of its words contains an ASCII
  # letter, or it contains a period.
  def self.ngram_rejected?(ngram)
    words = ngram.split

    ngram.is_stopword? ||
      words.all? { |word| word.length <= 1 } ||
      words.none? { |word| word =~ /[a-zA-Z]/ } ||
      ngram.include?('.')
  end
end