Commit 7f3e958ff8a78b1f8251027260efc357b83740cc
0 parents
Exists in
master
First commit. Stand-alone functions built on top of mirimiri and lda-ruby to extract contextual features from general corpora.
Showing 2 changed files with 82 additions and 0 deletions Inline Diff
README.markdown
File was created | 1 | # context | |
2 | |||
3 | Copyright (C) 2012 Romain Deveaud <romain.deveaud@gmail.com> | ||
4 | |||
5 | License | ||
6 | ======= | ||
7 | |||
8 | This program is free software: you can redistribute it and/or modify | ||
9 | it under the terms of the GNU General Public License as published by | ||
10 | the Free Software Foundation, either version 3 of the License, or | ||
11 | (at your option) any later version. | ||
12 | |||
13 | This program is distributed in the hope that it will be useful, | ||
14 | but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | GNU General Public License for more details. | ||
17 | |||
18 | You should have received a copy of the GNU General Public License | ||
19 | along with this program. If not, see <http://www.gnu.org/licenses/>. | ||
20 |
context.rb
File was created | 1 | #!/usr/bin/env ruby | |
2 | |||
3 | require 'mirimiri' | ||
4 | require 'sanitize' | ||
5 | require 'lda-ruby' | ||
6 | |||
# Stand-alone helpers that turn the top documents returned by an Indri index
# for a query into contextual features: a weighted n-gram query
# (term_context) or LDA topics (topic_context).
module Context
  # Index locations keyed by a short corpus name.
  IndexPaths = {
    :web_en     => '/mnt/disk2/ClueWeb09_English_1_sDocs',
    :web_fr     => '/mnt/disk2/ClueWeb09_French_1_sDocs',
    :web_nospam => '/mnt/disk1/ClueWeb09_English_1noSpam',
    :gigaword   => '/local/data/GigaWord/index',
    :nyt        => '/local/data/NYT_index',
    :wiki_en    => '/local/data/WikiEn_index',
    :wiki_fr    => '/local/data/WikiFr_index'
  }

  # Builds a "#weight ( score #1(term) ... ) " query string from the `size`
  # best-scoring n-grams found in the feedback documents of `query`.
  #
  # index_path - path of the index to query.
  # query      - query text used to retrieve the feedback documents.
  # size       - maximum number of n-grams kept in the result.
  # num_page   - number of feedback documents to retrieve.
  # args       - options hash (never modified):
  #              :func   - name of the scoring method called on the
  #                        Mirimiri::Document (default :entropy).
  #              :window - largest n-gram length considered (default 1).
  #
  # Returns the query String, or nil when no n-gram survives filtering.
  def self.term_context(index_path, query, size, num_page, args = {})
    # Read the options without writing the defaults back into the caller's hash.
    func   = (args[:func] || :entropy).to_sym
    window = args[:window] || 1

    docs     = feedback_docs(index_path, query, num_page)
    resource = Mirimiri::Document.new(docs.join(' '))

    # extract_ngrams leaves nil placeholders for rejected n-grams. They must be
    # dropped before the emptiness check, otherwise a list made only of
    # placeholders would produce an empty "#weight (  ) " query.
    terms = extract_ngrams(resource, func, window).compact
    best  = terms.sort { |a, b| b[0] <=> a[0] }.first(size)
    return nil if best.empty?

    weighted = best.collect { |score, term| "#{score} #1(#{term})" }.join(' ')
    "#weight ( #{weighted} ) "
  end

  # Runs LDA over the feedback documents of `query` and prints the `size` top
  # words of each topic to standard output.
  #
  # index_path - path of the index to query.
  # query      - query text used to retrieve the feedback documents.
  # size       - number of top words requested per topic.
  # num_page   - number of feedback documents; also drives the topic count
  #              (one topic per ten documents, at least one).
  # args       - accepted for signature parity with term_context; unused.
  #
  # Returns nil (the result of puts).
  # NOTE(review): the topics are only printed, never returned -- confirm
  # whether callers are expected to receive them.
  def self.topic_context(index_path, query, size, num_page, args = {})
    corpus = Lda::Corpus.new
    feedback_docs(index_path, query, num_page).each do |text|
      corpus.add_document(Lda::TextDocument.new(corpus, text))
    end

    lda = Lda::Lda.new(corpus)
    # Integer division yields 0 for fewer than ten documents; a model needs
    # at least one topic.
    lda.num_topics = [num_page / 10, 1].max
    lda.em('random')
    puts lda.top_words(size)
  end

  # The original file placed a bare `private` here, but that keyword does not
  # apply to singleton (`def self.`) methods, so the two helpers below have
  # always been callable from outside. They are left public so existing
  # callers keep working.

  # Retrieves `num_page` documents for `query` from the index at `index_path`
  # and returns their text, one String per document, after Sanitize cleaning.
  def self.feedback_docs(index_path, query, num_page)
    indri_query = Indri::IndriQuery.new({ :query => query, :count => num_page },
                                        '-printDocuments=true -trecFormat=true')
    index = Indri::IndriIndex.new(index_path)

    # The raw output is relabelled as ISO-8859-1 and transcoded to UTF-8
    # before it is parsed.
    output = index.runquery(indri_query).force_encoding('ISO-8859-1').encode('UTF-8')

    Indri::IndriPrintedDocuments.new(output).extract_docs.collect do |raw_doc|
      Sanitize.clean(raw_doc, :remove_contents => ['script'])
    end
  end

  # Scores every distinct n-gram of length 1..n found in `resource`.
  #
  # resource - object responding to `ngrams(i)` and to the scoring method.
  # func     - name (String or Symbol) of the public scoring method to call
  #            on `resource` with the n-gram.
  # n        - largest n-gram length.
  #
  # Returns an Array with one entry per distinct n-gram, in first-seen order:
  # [score, unaccented n-gram] for kept n-grams, nil for rejected ones.
  def self.extract_ngrams(resource, func, n)
    scorer    = func.to_sym
    raw_terms = 1.upto(n).collect { |i| resource.ngrams(i) }.flatten

    raw_terms.uniq.collect do |ngram|
      # public_send keeps a caller-supplied name from reaching private
      # methods (e.g. Kernel#system) with corpus text as the argument.
      [resource.public_send(scorer, ngram), ngram.unaccent] unless rejected_ngram?(ngram)
    end
  end

  # True when an n-gram should not be scored: it is a stopword, all of its
  # words are at most one character long, none of its words contains an
  # ASCII letter, or it contains a period.
  def self.rejected_ngram?(ngram)
    words = ngram.split
    ngram.is_stopword? ||
      words.all? { |word| word.length <= 1 } ||
      words.all? { |word| word !~ /[a-zA-Z]/ } ||
      ngram.include?('.')
  end
  private_class_method :rejected_ngram?
end
64 |