Blame view
context.rb
2.11 KB
7f3e958ff first commit. sta... |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
#!/usr/bin/env ruby require 'mirimiri' require 'sanitize' require 'lda-ruby' module Context IndexPaths = { :web_en => '/mnt/disk2/ClueWeb09_English_1_sDocs', :web_fr => '/mnt/disk2/ClueWeb09_French_1_sDocs', :web_nospam => '/mnt/disk1/ClueWeb09_English_1noSpam', :gigaword => '/local/data/GigaWord/index', :nyt => '/local/data/NYT_index', :wiki_en => '/local/data/WikiEn_index', :wiki_fr => '/local/data/WikiFr_index' } def Context.term_context index_path,query,size,num_page,args={} args[:func] ||= :entropy args[:window] ||= 1 docs = self.feedback_docs index_path,query,num_page resource = Mirimiri::Document.new docs.join(' ') terms = self.extract_ngrams resource,args[:func].to_sym,args[:window] context = "#weight ( #{terms.compact.sort{ |a,b| b[0] <=> a[0]}[0,size].collect { |e| "#{e[0]} #1(#{e[1]})" }.join " "} ) " unless terms.empty? context end def Context.topic_context index_path,query,size,num_page,args={} corpus = Lda::Corpus.new docs = self.feedback_docs index_path,query,num_page docs.each do |d| doc = Lda::TextDocument.new corpus,d corpus.add_document doc end lda = Lda::Lda.new corpus lda.num_topics = num_page/10 lda.em 'random' puts lda.top_words(size) end private def Context.feedback_docs index_path,query,num_page query = Indri::IndriQuery.new({:query => query, :count => num_page},"-printDocuments=true -trecFormat=true") index = Indri::IndriIndex.new index_path idocs = Indri::IndriPrintedDocuments.new(index.runquery(query).force_encoding("ISO-8859-1").encode("UTF-8")) docs = idocs.extract_docs.collect { |idoc| Sanitize.clean idoc,:remove_contents => ['script'] } docs end def Context.extract_ngrams resource,func,n raw_terms = 1.upto(n).collect { |i| resource.ngrams(i) }.flatten terms = raw_terms.uniq.collect { |w| [resource.send(func.to_sym,w), w.unaccent] unless w.is_stopword? || w.split.all? { |e| e.length <= 1 } || w.split.all? { |e| e !~ /[a-zA-Z]/ } || w.include?(".") } terms end end |