Deveaud Romain / context

Commit 7f3e958ff8a78b1f8251027260efc357b83740cc

Authored by Romain Deveaud 2012-03-08 19:20:33 +0100

0 parents

Exists in master

First commit. Stand-alone functions built on top of mirimiri and lda-ruby to extract contextual features from general corpora.

Showing 2 changed files with 82 additions and 0 deletions Inline Diff

README.markdown
context.rb

README.markdown

Diff comments View file @ 7f3e958

File was created	1	# context
	2
	3	Copyright (C) 2012 Romain Deveaud <romain.deveaud@gmail.com>
	4
	5	License
	6	=======
	7
	8	This program is free software: you can redistribute it and/or modify
	9	it under the terms of the GNU General Public License as published by
	10	the Free Software Foundation, either version 3 of the License, or
	11	(at your option) any later version.
	12
	13	This program is distributed in the hope that it will be useful,
	14	but WITHOUT ANY WARRANTY; without even the implied warranty of
	15	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	16	GNU General Public License for more details.
	17
	18	You should have received a copy of the GNU General Public License
	19	along with this program. If not, see <http://www.gnu.org/licenses/>.
	20

context.rb

Diff comments View file @ 7f3e958

File was created	1	#!/usr/bin/env ruby
	2
	3	require 'mirimiri'
	4	require 'sanitize'
	5	require 'lda-ruby'
	6
	# Stand-alone helpers that query an Indri index for feedback documents and
	# derive "context" from them, either as a weighted term query (term_context)
	# or as LDA topics (topic_context).
	#
	# Relies on names loaded elsewhere in the file: Mirimiri::Document, the Indri
	# wrappers, Sanitize, Lda, and the String#is_stopword? / String#unaccent
	# extensions (presumably provided by mirimiri -- confirm).
	module Context
	  # Index locations keyed by a short collection name. The paths are
	  # machine-specific absolute paths.
	  IndexPaths = {
	    :web_en     => '/mnt/disk2/ClueWeb09_English_1_sDocs',
	    :web_fr     => '/mnt/disk2/ClueWeb09_French_1_sDocs',
	    :web_nospam => '/mnt/disk1/ClueWeb09_English_1noSpam',
	    :gigaword   => '/local/data/GigaWord/index',
	    :nyt        => '/local/data/NYT_index',
	    :wiki_en    => '/local/data/WikiEn_index',
	    :wiki_fr    => '/local/data/WikiFr_index'
	  }

	  # Builds a "#weight ( score #1(term) ... ) " query string from the
	  # best-scoring n-grams found in the feedback documents of +query+.
	  #
	  # index_path - path of the index to query.
	  # query      - query used to retrieve the feedback documents.
	  # size       - maximum number of weighted terms to keep.
	  # num_page   - number of feedback documents to retrieve.
	  # args       - optional settings:
	  #              :func   - name of the scoring method called on the
	  #                        Mirimiri::Document (default :entropy).
	  #              :window - largest n-gram length to extract (default 1).
	  #
	  # Returns the query String, or nil when no usable term was found.
	  def self.term_context(index_path, query, size, num_page, args = {})
	    # Read the options without writing defaults back into the caller's hash.
	    func   = (args[:func] || :entropy).to_sym
	    window = args[:window] || 1

	    docs     = feedback_docs(index_path, query, num_page)
	    resource = Mirimiri::Document.new(docs.join(' '))

	    # extract_ngrams only returns kept terms, so an empty result really means
	    # "nothing to weight" and we never emit an empty "#weight (  ) " query.
	    terms = extract_ngrams(resource, func, window)
	    return nil if terms.empty?

	    # Highest score first, then keep the +size+ best terms.
	    best = terms.sort { |a, b| b[0] <=> a[0] }.first(size)
	    weighted = best.collect { |score, term| "#{score} #1(#{term})" }.join(' ')
	    "#weight ( #{weighted} ) "
	  end

	  # Runs LDA over the feedback documents of +query+ and prints the top
	  # +size+ words of each topic to standard output.
	  #
	  # index_path - path of the index to query.
	  # query      - query used to retrieve the feedback documents.
	  # size       - number of top words requested per topic.
	  # num_page   - number of feedback documents to retrieve.
	  # args       - accepted for symmetry with term_context; currently unused.
	  #
	  # Returns nil (the result of puts).
	  def self.topic_context(index_path, query, size, num_page, args = {})
	    corpus = Lda::Corpus.new
	    feedback_docs(index_path, query, num_page).each do |text|
	      corpus.add_document(Lda::TextDocument.new(corpus, text))
	    end

	    lda = Lda::Lda.new(corpus)
	    # One topic per ten feedback documents, but never zero: integer division
	    # yields 0 as soon as num_page < 10.
	    lda.num_topics = [num_page / 10, 1].max
	    lda.em('random')
	    puts lda.top_words(size)
	  end

	  # The two helpers below stay public: a bare `private` does not apply to
	  # singleton (`def self.`) methods, and hiding them now could break callers
	  # that already use Context.feedback_docs / Context.extract_ngrams.

	  # Retrieves the +num_page+ top documents for +query+ from the index at
	  # +index_path+ and strips their markup.
	  #
	  # Returns an Array with one cleaned String per retrieved document.
	  def self.feedback_docs(index_path, query, num_page)
	    indri_query = Indri::IndriQuery.new({ :query => query, :count => num_page },
	                                        "-printDocuments=true -trecFormat=true")
	    index = Indri::IndriIndex.new(index_path)

	    # The raw output is tagged as ISO-8859-1 and transcoded to UTF-8 before
	    # it is parsed.
	    raw = index.runquery(indri_query).force_encoding("ISO-8859-1").encode("UTF-8")

	    Indri::IndriPrintedDocuments.new(raw).extract_docs.collect do |idoc|
	      Sanitize.clean(idoc, :remove_contents => ['script'])
	    end
	  end

	  # Scores every distinct n-gram of length 1..n found in +resource+.
	  #
	  # resource - object responding to ngrams(i) and to the scoring method.
	  # func     - name of the scoring method, called as resource.func(ngram).
	  # n        - largest n-gram length to extract.
	  #
	  # Returns an Array of [score, unaccented_ngram] pairs. Rejected n-grams are
	  # left out of the result (no nil placeholders).
	  def self.extract_ngrams(resource, func, n)
	    scorer     = func.to_sym
	    candidates = 1.upto(n).flat_map { |i| resource.ngrams(i) }.uniq

	    candidates.reject { |ngram| noise_term?(ngram) }.collect do |ngram|
	      [resource.send(scorer, ngram), ngram.unaccent]
	    end
	  end

	  # True when +ngram+ should not become a context term: it is a stopword,
	  # all of its tokens are at most one character long, none of its tokens
	  # contains an ASCII letter, or it contains a dot.
	  def self.noise_term?(ngram)
	    tokens = ngram.split
	    ngram.is_stopword? ||
	      tokens.all? { |t| t.length <= 1 } ||
	      tokens.all? { |t| t !~ /[a-zA-Z]/ } ||
	      ngram.include?(".")
	  end
	  private_class_method :noise_term?
	end
	64