Commit 7f3e958ff8a78b1f8251027260efc357b83740cc

Authored by Romain Deveaud
0 parents
Exists in master

First commit: stand-alone functions built on top of mirimiri and lda-ruby to extract contextual features from general corpora.

Showing 2 changed files with 82 additions and 0 deletions Side-by-side Diff

... ... @@ -0,0 +1,19 @@
  1 +# context
  2 +
  3 +Copyright (C) 2012 Romain Deveaud <romain.deveaud@gmail.com>
  4 +
  5 +License
  6 +=======
  7 +
  8 +This program is free software: you can redistribute it and/or modify
  9 +it under the terms of the GNU General Public License as published by
  10 +the Free Software Foundation, either version 3 of the License, or
  11 +(at your option) any later version.
  12 +
  13 +This program is distributed in the hope that it will be useful,
  14 +but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 +GNU General Public License for more details.
  17 +
  18 +You should have received a copy of the GNU General Public License
  19 +along with this program. If not, see <http://www.gnu.org/licenses/>.
... ... @@ -0,0 +1,63 @@
  1 +#!/usr/bin/env ruby
  2 +
  3 +require 'mirimiri'
  4 +require 'sanitize'
  5 +require 'lda-ruby'
  6 +
# Stand-alone helpers that derive a "context" for a query from the documents
# retrieved for it: either a weighted list of n-grams (term_context) or LDA
# topics (topic_context).
module Context
  # Index locations keyed by a short corpus name. Nothing in this module reads
  # the table; presumably callers use it to choose the index_path argument.
  IndexPaths = {
    :web_en     => '/mnt/disk2/ClueWeb09_English_1_sDocs',
    :web_fr     => '/mnt/disk2/ClueWeb09_French_1_sDocs',
    :web_nospam => '/mnt/disk1/ClueWeb09_English_1noSpam',
    :gigaword   => '/local/data/GigaWord/index',
    :nyt        => '/local/data/NYT_index',
    :wiki_en    => '/local/data/WikiEn_index',
    :wiki_fr    => '/local/data/WikiFr_index'
  }

  # Builds a weighted query string from the best-scoring n-grams found in the
  # documents retrieved for +query+.
  #
  # index_path - path handed to Indri::IndriIndex.new.
  # query      - query text used to retrieve the documents.
  # size       - maximum number of n-grams kept in the result.
  # num_page   - number of documents requested from the index.
  # args       - optional Hash:
  #              :func   - name of the scoring method called on the
  #                        Mirimiri::Document (default :entropy).
  #              :window - largest n-gram length considered (default 1).
  #
  # Returns a String of the form "#weight ( score #1(ngram) ... ) ", ordered by
  # decreasing score, or nil when no n-gram survives filtering.
  def self.term_context(index_path, query, size, num_page, args = {})
    # Read the options without writing the defaults back into the caller's hash.
    func   = (args[:func] || :entropy).to_sym
    window = args[:window] || 1

    docs     = feedback_docs(index_path, query, num_page)
    resource = Mirimiri::Document.new(docs.join(' '))

    # extract_ngrams leaves a nil for every rejected n-gram. Drop them BEFORE
    # testing for emptiness, otherwise a list made only of rejects would yield
    # the empty query "#weight (  ) ".
    scored = extract_ngrams(resource, func, window).compact
    return nil if scored.empty?

    best     = scored.sort { |a, b| b[0] <=> a[0] }.first(size)
    weighted = best.collect { |score, term| "#{score} #1(#{term})" }.join(' ')

    "#weight ( #{weighted} ) "
  end

  # Runs LDA over the documents retrieved for +query+ and prints the top
  # +size+ words of the model (lda.top_words) to standard output.
  #
  # index_path - path handed to Indri::IndriIndex.new.
  # query      - query text used to retrieve the documents.
  # size       - number of words requested from lda.top_words.
  # num_page   - number of documents requested; also sets the number of
  #              topics (one per ten documents, never fewer than one).
  # args       - accepted for signature parity with term_context; unused.
  #
  # Returns nil (the result of puts).
  def self.topic_context(index_path, query, size, num_page, args = {})
    corpus = Lda::Corpus.new

    feedback_docs(index_path, query, num_page).each do |text|
      corpus.add_document(Lda::TextDocument.new(corpus, text))
    end

    lda = Lda::Lda.new(corpus)
    # Integer division yields 0 for fewer than 10 documents; always ask for at
    # least one topic (a zero-topic model is presumably meaningless).
    lda.num_topics = [num_page / 10, 1].max
    lda.em('random')
    puts lda.top_words(size)
  end

  # --- Internal helpers ------------------------------------------------------
  # NOTE: a bare `private` does not affect methods defined on the module itself
  # (`def self.x` / `def Context.x`), so these helpers have always been callable
  # from outside. They stay public to avoid breaking existing callers.

  # Retrieves up to +num_page+ documents for +query+ from the index at
  # +index_path+ and returns them as an Array of cleaned strings.
  #
  # The raw Indri output is re-tagged as ISO-8859-1 and transcoded to UTF-8
  # before the documents are extracted; each document is then passed through
  # Sanitize.clean with the contents of <script> elements removed.
  def self.feedback_docs(index_path, query, num_page)
    indri_query = Indri::IndriQuery.new({ :query => query, :count => num_page },
                                        "-printDocuments=true -trecFormat=true")
    index  = Indri::IndriIndex.new(index_path)
    output = index.runquery(indri_query).force_encoding("ISO-8859-1").encode("UTF-8")

    Indri::IndriPrintedDocuments.new(output).extract_docs.collect do |idoc|
      Sanitize.clean(idoc, :remove_contents => ['script'])
    end
  end

  # Scores every distinct n-gram of length 1..n found in +resource+.
  #
  # resource - object responding to ngrams(i) and to the scoring method +func+.
  # func     - name (String or Symbol) of the scoring method, called as
  #            resource.send(func, ngram).
  # n        - largest n-gram length.
  #
  # Returns an Array with one entry per distinct n-gram: [score, unaccented
  # n-gram] when it is kept, nil when it is rejected. An n-gram is rejected if
  # it is a stopword, if all of its tokens are at most one character long, if
  # none of its tokens contains an ASCII letter, or if it contains a ".".
  def self.extract_ngrams(resource, func, n)
    scorer     = func.to_sym
    candidates = 1.upto(n).flat_map { |i| resource.ngrams(i) }.uniq

    candidates.collect do |w|
      tokens = w.split
      next if w.is_stopword? ||
              tokens.all? { |e| e.length <= 1 } ||
              tokens.all? { |e| e !~ /[a-zA-Z]/ } ||
              w.include?(".")

      [resource.send(scorer, w), w.unaccent]
    end
  end
end