context.rb
2.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/usr/bin/env ruby
require 'mirimiri'
require 'sanitize'
require 'lda-ruby'
# Helpers that build expansion contexts for a query from the documents an
# Indri index returns for it (the "feedback documents").
module Context
  # Index locations keyed by a short corpus name.
  IndexPaths = {
    :web_en     => '/mnt/disk2/ClueWeb09_English_1_sDocs',
    :web_fr     => '/mnt/disk2/ClueWeb09_French_1_sDocs',
    :web_nospam => '/mnt/disk1/ClueWeb09_English_1noSpam',
    :gigaword   => '/local/data/GigaWord/index',
    :nyt        => '/local/data/NYT_index',
    :wiki_en    => '/local/data/WikiEn_index',
    :wiki_fr    => '/local/data/WikiFr_index'
  }

  # Builds a "#weight ( score #1(ngram) ... ) " query string from the
  # highest-scoring n-grams of the feedback documents retrieved for +query+.
  #
  # index_path - path handed to Indri::IndriIndex.new
  # query      - query text used to fetch the feedback documents
  # size       - maximum number of n-grams kept in the result
  # num_page   - number of feedback documents requested
  # args       - optional settings (the hash is read, never modified):
  #              :func   - name of the scoring method called on the
  #                        Mirimiri::Document (default :entropy)
  #              :window - largest n-gram length extracted (default 1)
  #
  # Returns the query string, or nil when no n-gram survives filtering.
  def Context.term_context(index_path, query, size, num_page, args = {})
    # Defaults are kept in locals so the caller's hash is left untouched.
    func   = (args[:func] || :entropy).to_sym
    window = args[:window] || 1

    docs     = feedback_docs(index_path, query, num_page)
    resource = Mirimiri::Document.new(docs.join(' '))

    # extract_ngrams leaves a nil for every rejected n-gram; drop those
    # before deciding whether there is anything to emit, otherwise an
    # all-rejected list would produce an empty "#weight (  ) " query.
    scored = extract_ngrams(resource, func, window).compact
    best   = scored.sort { |a, b| b[0] <=> a[0] }.first(size)
    return nil if best.empty?

    weighted = best.collect { |score, ngram| "#{score} #1(#{ngram})" }
    "#weight ( #{weighted.join(' ')} ) "
  end

  # Fits an LDA model on the feedback documents retrieved for +query+ and
  # prints the result of lda.top_words(size) to standard output.
  #
  # The number of topics is num_page / 10 (integer division), with a floor
  # of one so that fewer than ten feedback documents do not request zero
  # topics. +args+ is accepted for signature parity with term_context but
  # is not used.
  #
  # Returns nil (the value of puts).
  def Context.topic_context(index_path, query, size, num_page, args = {})
    corpus = Lda::Corpus.new
    feedback_docs(index_path, query, num_page).each do |text|
      corpus.add_document(Lda::TextDocument.new(corpus, text))
    end

    lda = Lda::Lda.new(corpus)
    lda.num_topics = [num_page / 10, 1].max
    lda.em 'random'
    puts lda.top_words(size)
  end

  # NOTE: the methods below are internal helpers. A bare `private` keyword
  # does not apply to methods defined as `def Context.name`, so they are, and
  # always have been, callable from outside the module.

  # Runs +query+ against the index at +index_path+, asking for +num_page+
  # printed documents, and returns the documents as an array of strings
  # passed through Sanitize.clean (with the contents of <script> elements
  # removed).
  def Context.feedback_docs(index_path, query, num_page)
    indri_query = Indri::IndriQuery.new({ :query => query, :count => num_page },
                                        "-printDocuments=true -trecFormat=true")
    index = Indri::IndriIndex.new(index_path)
    # The raw output is relabelled as ISO-8859-1 and transcoded to UTF-8
    # before it is parsed.
    output  = index.runquery(indri_query).force_encoding("ISO-8859-1").encode("UTF-8")
    printed = Indri::IndriPrintedDocuments.new(output)
    printed.extract_docs.collect { |doc| Sanitize.clean(doc, :remove_contents => ['script']) }
  end

  # Scores every distinct n-gram of length 1..n found in +resource+.
  #
  # resource - object responding to ngrams(i) and to the scoring method
  # func     - name of the scoring method, called as resource.func(ngram)
  # n        - largest n-gram length
  #
  # Returns an array with one entry per distinct n-gram: [score, unaccented
  # n-gram] for accepted n-grams, nil for rejected ones. An n-gram is
  # rejected when it is a stopword, when all of its words are at most one
  # character long, when none of its words contains an ASCII letter, or when
  # it contains a period.
  def Context.extract_ngrams(resource, func, n)
    scorer     = func.to_sym
    candidates = 1.upto(n).collect { |i| resource.ngrams(i) }.flatten.uniq

    candidates.collect do |ngram|
      words = ngram.split
      next if ngram.is_stopword? ||
              words.all? { |word| word.length <= 1 } ||
              words.all? { |word| word !~ /[a-zA-Z]/ } ||
              ngram.include?(".")

      [resource.send(scorer, ngram), ngram.unaccent]
    end
  end
end