Commit 7f3e958ff8a78b1f8251027260efc357b83740cc

Authored by Romain Deveaud
0 parents
Exists in master

first commit. stand-alone functions built on top of mirimiri and lda-ruby to extract contextual features from general corpora.

Showing 2 changed files with 82 additions and 0 deletions Inline Diff

File was created 1 # context
2
3 Copyright (C) 2012 Romain Deveaud <romain.deveaud@gmail.com>
4
5 License
6 =======
7
8 This program is free software: you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation, either version 3 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>.
20
File was created 1 #!/usr/bin/env ruby
2
3 require 'mirimiri'
4 require 'sanitize'
5 require 'lda-ruby'
6
# Stand-alone functions that derive contextual features (weighted terms or
# LDA topics) from the documents returned by an Indri query.
module Context
  # Index locations keyed by a short corpus name. Not referenced by the
  # methods below; presumably meant to be passed as their +index_path+
  # argument (TODO confirm against callers).
  IndexPaths = {
    :web_en     => '/mnt/disk2/ClueWeb09_English_1_sDocs',
    :web_fr     => '/mnt/disk2/ClueWeb09_French_1_sDocs',
    :web_nospam => '/mnt/disk1/ClueWeb09_English_1noSpam',
    :gigaword   => '/local/data/GigaWord/index',
    :nyt        => '/local/data/NYT_index',
    :wiki_en    => '/local/data/WikiEn_index',
    :wiki_fr    => '/local/data/WikiFr_index'
  }.freeze

  # Builds a "#weight ( score #1(term) ... ) " string from the +size+
  # best-scored n-grams found in the feedback documents of +query+.
  #
  # index_path - path handed to the index used for retrieval.
  # query      - query text used to fetch the feedback documents.
  # size       - maximum number of weighted terms kept.
  # num_page   - number of feedback documents requested.
  # args       - optional settings (never modified):
  #              :func   - scoring method invoked on the document for each
  #                        n-gram (default :entropy).
  #              :window - largest n-gram length considered (default 1).
  #
  # Returns the query string, or nil when no n-gram survives filtering.
  def self.term_context(index_path, query, size, num_page, args = {})
    # Read the defaults instead of writing them back with ||=, so the
    # caller's hash is left untouched (and may be frozen).
    func   = (args[:func] || :entropy).to_sym
    window = args[:window] || 1

    docs     = feedback_docs index_path, query, num_page
    resource = Mirimiri::Document.new docs.join(' ')

    # Rejected n-grams come back as nil; drop them *before* testing for
    # emptiness, otherwise an all-rejected list yields "#weight (  ) ".
    terms = extract_ngrams(resource, func, window).compact
    return nil if terms.empty?

    # Highest score first.
    best     = terms.sort { |a, b| b[0] <=> a[0] }.first(size)
    weighted = best.collect { |score, term| "#{score} #1(#{term})" }.join(' ')

    "#weight ( #{weighted} ) "
  end

  # Fits an LDA model on the feedback documents of +query+ and prints the
  # +size+ top words of each topic to standard output.
  #
  # index_path - path handed to the index used for retrieval.
  # query      - query text used to fetch the feedback documents.
  # size       - number of top words requested per topic.
  # num_page   - number of feedback documents requested; also drives the
  #              number of topics (one per ten documents, at least one).
  # args       - accepted for signature parity with term_context; unused.
  #
  # Returns nil (the result of puts).
  def self.topic_context(index_path, query, size, num_page, args = {})
    corpus = Lda::Corpus.new

    feedback_docs(index_path, query, num_page).each do |text|
      corpus.add_document Lda::TextDocument.new(corpus, text)
    end

    lda = Lda::Lda.new corpus
    # Integer division gives 0 topics below ten documents; keep at least one.
    lda.num_topics = [num_page / 10, 1].max
    lda.em 'random'
    puts lda.top_words(size)
  end

  # Internal helpers.
  #
  # NOTE: a bare `private` does not apply to methods defined on the module
  # itself (`def self.x` / `def Context.x`), so these have always been
  # callable as Context.feedback_docs / Context.extract_ngrams. They are
  # left public for backward compatibility; `private_class_method` would be
  # the way to actually hide them.

  # Runs +query+ against the index at +index_path+ and returns the text of
  # up to +num_page+ retrieved documents.
  #
  # The raw output is tagged as ISO-8859-1 and transcoded to UTF-8, then each
  # extracted document is passed through Sanitize.clean with the contents of
  # <script> elements removed.
  #
  # Returns an Array with one sanitized entry per extracted document.
  def self.feedback_docs(index_path, query, num_page)
    indri_query = Indri::IndriQuery.new({ :query => query, :count => num_page },
                                        "-printDocuments=true -trecFormat=true")
    index = Indri::IndriIndex.new index_path

    raw     = index.runquery(indri_query).force_encoding("ISO-8859-1").encode("UTF-8")
    printed = Indri::IndriPrintedDocuments.new(raw)

    printed.extract_docs.collect do |idoc|
      Sanitize.clean idoc, :remove_contents => ['script']
    end
  end

  # Scores every distinct n-gram of +resource+ for lengths 1 through +n+.
  #
  # resource - object responding to ngrams(length) and to the scoring
  #            method named by +func+.
  # func     - name of the scoring method, called as resource.func(ngram).
  # n        - largest n-gram length to extract.
  #
  # An n-gram is rejected when it is a stopword, when all of its words are
  # at most one character long, when none of its words contains an ASCII
  # letter, or when it contains a ".".
  #
  # Returns an Array holding [score, unaccented_ngram] for each kept n-gram
  # and nil for each rejected one (callers are expected to compact it).
  def self.extract_ngrams(resource, func, n)
    scorer     = func.to_sym
    candidates = 1.upto(n).collect { |i| resource.ngrams(i) }.flatten.uniq

    candidates.collect do |w|
      words = w.split
      next if w.is_stopword? ||
              words.all? { |e| e.length <= 1 } ||
              words.all? { |e| e !~ /[a-zA-Z]/ } ||
              w.include?(".")

      [resource.send(scorer, w), w.unaccent]
    end
  end
end
64