Deveaud Romain / mirimiri

Browse Code »

Commit e0e33fca06e4913aefe250d4d9458464a250052e

Authored by Romain Deveaud 2012-03-08 08:50:39 +0100

1 parent aa386f5530

Exists in master

new way of querying indri. entropy of n-grams. sdm is now part of the string class.

Showing 6 changed files with 95 additions and 16 deletions Inline Diff

lib/mirimiri.rb
lib/mirimiri/document.rb
lib/mirimiri/index.rb
lib/mirimiri/query.rb
lib/mirimiri/string.rb
main.rb

lib/mirimiri.rb

Diff comments View file @ e0e33fc

1	#!/usr/bin/env ruby	1	#!/usr/bin/env ruby
2		2
3	require 'mirimiri/document'	3	require 'mirimiri/document'
4	require 'mirimiri/string'	4	require 'mirimiri/string'
		5	require 'mirimiri/result'
5	require 'mirimiri/query'	6	require 'mirimiri/query'
		7	require 'mirimiri/index'
6	require 'mirimiri/corpus'	8	require 'mirimiri/corpus'
7	require 'mirimiri/regexp'	9	require 'mirimiri/regexp'
8	require 'mirimiri/ttagger'	10	require 'mirimiri/ttagger'
9		11

lib/mirimiri/document.rb

Diff comments View file @ e0e33fc

1	#!/usr/bin/env ruby	1	#!/usr/bin/env ruby
2		2
3	#--	3	#--
4	# This file is a part of the mirimiri library	4	# This file is a part of the mirimiri library
5	#	5	#
6	# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>	6	# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
7	#	7	#
8	# This program is free software: you can redistribute it and/or modify	8	# This program is free software: you can redistribute it and/or modify
9	# it under the terms of the GNU General Public License as published by	9	# it under the terms of the GNU General Public License as published by
10	# the Free Software Foundation, either version 3 of the License, or	10	# the Free Software Foundation, either version 3 of the License, or
11	# (at your option) any later version.	11	# (at your option) any later version.
12	#	12	#
13	# This program is distributed in the hope that it will be useful,	13	# This program is distributed in the hope that it will be useful,
14	# but WITHOUT ANY WARRANTY; without even the implied warranty of	14	# but WITHOUT ANY WARRANTY; without even the implied warranty of
15	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the	15	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16	# GNU General Public License for more details.	16	# GNU General Public License for more details.
17	#	17	#
18	# You should have received a copy of the GNU General Public License	18	# You should have received a copy of the GNU General Public License
19	# along with this program. If not, see <http://www.gnu.org/licenses/>.	19	# along with this program. If not, see <http://www.gnu.org/licenses/>.
20	#++	20	#++
21		21
22		22
23	# General module	23	# General module
24	module Mirimiri	24	module Mirimiri
25		25
26	# A Document is a bag of words and is constructed from a string.	26	# A Document is a bag of words and is constructed from a string.
27	class Document	27	class Document
28	attr_reader :words, :doc_content, :count_words	28	attr_reader :words, :doc_content, :count_words
29		29
30	# Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html	30	# Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html
31	# and the \\W special escape).	31	# and the \\W special escape).
32	#	32	#
33	# Protected function, only meant to be called at the initialization.	33	# Protected function, only meant to be called at the initialization.
34	def format_words	34	def format_words
35	wo = []	35	wo = []
36		36
37	@doc_content.split.each do \|w\|	37	@doc_content.split.each do \|w\|
38	w.split(/\W/).each do \|sw\|	38	w.split(/\W/).each do \|sw\|
39	wo.push(sw.downcase) if sw =~ /[a-zA-Z]/	39	wo.push(sw.downcase) if sw =~ /[a-zA-Z]/
40	end	40	end
41	end	41	end
42		42
43	wo	43	wo
44	end	44	end
45		45
46	# Returns an Array containing the +n+-grams (words) from the current Document.	46	# Returns an Array containing the +n+-grams (words) from the current Document.
47	#	47	#
48	# ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]	48	# ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]
49	def ngrams(n)	49	def ngrams(n)
50	window = []	50	window = []
51	ngrams_array = []	51	ngrams_array = []
52		52
53	@words.each do \|w\|	53	@words.each do \|w\|
54	window.push(w)	54	window.push(w)
55	if window.size == n	55	if window.size == n
56	ngrams_array.push window.join(" ")	56	ngrams_array.push window.join(" ")
57	window.delete_at(0)	57	window.delete_at(0)
58	end	58	end
59	end	59	end
60		60
61	ngrams_array.uniq	61	ngrams_array
62	end	62	end
63		63
64	# Returns a Hash containing the words and their associated counts in the current Document.	64	# Returns a Hash containing the words and their associated counts in the current Document.
65	#	65	#
66	# count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }	66	# count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }
67	def count_words	67	def count_words
68	counts = Hash.new { \|h,k\| h[k] = 0 }	68	counts = Hash.new { \|h,k\| h[k] = 0 }
69	@words.each { \|w\| counts[w] += 1 }	69	@words.each { \|w\| counts[w] += 1 }
70		70
71	counts	71	counts
72	end	72	end
73		73
		74	# Old entropy function.
		75	# TODO: remove.
		76	def entropy0(s)
		77	en = 0.0
		78
		79	s.split.each do \|w\|
		80	p_wi = @count_words[w].to_f/@words.count.to_f
		81	en += p_wi*Math.log2(p_wi)
		82	end
		83
		84	en *= -1
		85	en
		86	end
		87
74	# Computes the entropy of a given string +s+ inside the document.	88	# Computes the entropy of a given string +s+ inside the document.
75	#	89	#
76	# If the string parameter is composed of many words (i.e. tokens separated	90	# If the string parameter is composed of many words (i.e. tokens separated
77	# by whitespace(s)), it is considered as an ngram.	91	# by whitespace(s)), it is considered as an ngram.
78	#	92	#
79	# entropy("guitar") #=> 0.00432114812727959	93	# entropy("guitar") #=> 0.014348983965324762
80	# entropy("dillinger escape plan") #=> 0.265862076325102	94	# entropy("dillinger escape plan") #=> 0.054976093116768154
81	def entropy(s)	95	def entropy(s)
82	en = 0.0	96	en = 0.0
83		97
84	s.split.each do \|w\|	98	size = s.split.size
85	p_wi = @count_words[w].to_f/@words.count.to_f	99
86	en += p_wi*Math.log2(p_wi)	100	if size == 1
		101	p_wi = @count_words[s].to_f/@words.count.to_f
		102	en += p_wi*Math.log(p_wi)
		103	elsif size > 1
		104	ng_size = ngrams(size)
		105	p_wi = ng_size.count(s).to_f/ng_size.count.to_f
		106	en += p_wi*Math.log(p_wi)
87	end	107	end
88		108
89	en *= -1	109	en *= -1
90	en	110	en
91	end	111	end
92		112
93	# Computes the term frequency of a given word +s+.	113	# Computes the term frequency of a given word +s+.
94	#	114	#
95	# tf("guitar") #=> 0.000380372765310004	115	# tf("guitar") #=> 0.000380372765310004
96	def tf(s)	116	def tf(s)
97	@count_words[s].to_f/@words.size.to_f	117	@count_words[s].to_f/@words.size.to_f
98	end	118	end
99		119
100		120
101	def initialize(content="")	121	def initialize(content="")
102	@doc_content = content	122	@doc_content = content
103	@words = format_words	123	@words = format_words
104	@count_words = count_words	124	@count_words = count_words
105	end	125	end
106		126
107	protected :format_words, :count_words	127	protected :format_words, :count_words
108	end	128	end
109		129
110	# A WebDocument is a Document with a +url+.	130	# A WebDocument is a Document with a +url+.
111	class WebDocument < Document	131	class WebDocument < Document
112	attr_reader :url	132	attr_reader :url
113		133
114	# Returns the HTML text from the page of a given +url+.	134	# Returns the HTML text from the page of a given +url+.
115	def self.get_content(url)	135	def self.get_content(url)
116	require 'net/http'	136	require 'net/http'
117	Net::HTTP.get(URI.parse(url))	137	Net::HTTP.get(URI.parse(url))
118	end	138	end
119		139
120		140
121	# WebDocument constructor, the content of the Document is the HTML page	141	# WebDocument constructor, the content of the Document is the HTML page
122	# without the tags.	142	# without the tags.
123	def initialize(url,only_tags=nil)	143	def initialize(url,only_tags=nil)
124	require 'sanitize'	144	require 'sanitize'
125		145
126	@url = url	146	@url = url
127	content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("")	147	content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("")
128	super Sanitize.clean(content.unaccent.toutf8.force_encoding("UTF-8"), :remove_contents => ['script'])	148	super Sanitize.clean(content.unaccent.toutf8.force_encoding("UTF-8"), :remove_contents => ['script'])
129	end	149	end
130	end	150	end
131		151
132	# A WikipediaPage is a WebDocument.	152	# A WikipediaPage is a WebDocument.
133	class WikipediaPage < WebDocument	153	class WikipediaPage < WebDocument
134	require 'rexml/document'	154	require 'rexml/document'
135	require 'net/http'	155	require 'net/http'
136	require 'kconv'	156	require 'kconv'
137		157
138		158
139	def self.search_wikipedia_titles(name)	159	def self.search_wikipedia_titles(name)
140	raise ArgumentError, "Bad encoding", name unless name.isutf8	160	raise ArgumentError, "Bad encoding", name unless name.isutf8
141		161
142	res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).unaccent.toutf8).elements['api/query/search']	162	res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).unaccent.toutf8).elements['api/query/search']
143		163
144	res.collect { \|e\| e.attributes['title'] } unless res.nil?	164	res.collect { \|e\| e.attributes['title'] } unless res.nil?
145	end	165	end
146		166
147	def self.get_url(name)	167	def self.get_url(name)
148	raise ArgumentError, "Bad encoding", name unless name.isutf8	168	raise ArgumentError, "Bad encoding", name unless name.isutf8
149		169
150	atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).unaccent.toutf8).elements['api/query/pages/page'].attributes	170	atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).unaccent.toutf8).elements['api/query/pages/page'].attributes
151		171
152	atts['fullurl'] if atts['missing'].nil?	172	atts['fullurl'] if atts['missing'].nil?
153	end	173	end
154		174
155	def self.search_homepage(name)	175	def self.search_homepage(name)
156	title = WikipediaPage.search_wikipedia_titles name	176	title = WikipediaPage.search_wikipedia_titles name
157		177
158	WikipediaPage.get_url(title[0]) unless title.nil? \|\| title.empty?	178	WikipediaPage.get_url(title[0]) unless title.nil? \|\| title.empty?
159	end	179	end
160		180
161	def self.extract_anchors(url)	181	def self.extract_anchors(url)
162	self.get_content(url).extract_xmltags_values('p').join(' ').scan(/<a href="(.+?)" title=.?>(.+?)<\/a>/).delete_if { \|a\| a[0] =~ /^\/wiki\/.$/.negated }	182	self.get_content(url).extract_xmltags_values('p').join(' ').scan(/<a href="(.+?)" title=.?>(.+?)<\/a>/).delete_if { \|a\| a[0] =~ /^\/wiki\/.$/.negated }
163	end	183	end
164	end	184	end
165		185
166	class FreebasePage < WebDocument	186	class FreebasePage < WebDocument
167	require 'net/http'	187	require 'net/http'
168	require 'kconv'	188	require 'kconv'
169	require 'json'	189	require 'json'
170		190
171	def self.search_article_ids query,limit	191	def self.search_article_ids query,limit
172	raise ArgumentError, "Bad encoding", name unless name.isutf8	192	raise ArgumentError, "Bad encoding", name unless name.isutf8
173		193
174	JSON.parse(Net::HTTP.get( URI.parse "http://api.freebase.com/api/service/search?query=#{query.gsub(" ","+")}&limit=#{limit}" ))['result'].collect { \|a\| a['article']['id'] unless a['article'].nil? }.compact	194	JSON.parse(Net::HTTP.get( URI.parse "http://api.freebase.com/api/service/search?query=#{query.gsub(" ","+")}&limit=#{limit}" ))['result'].collect { \|a\| a['article']['id'] unless a['article'].nil? }.compact
175	end	195	end
176		196
177	def self.get_url id	197	def self.get_url id
178	"http://api.freebase.com/api/trans/raw#{id}"	198	"http://api.freebase.com/api/trans/raw#{id}"
179	end	199	end
180	end	200	end
181	end	201	end
182		202

lib/mirimiri/index.rb

Diff comments View file @ e0e33fc

 #!/usr/bin/env ruby
 #--
 # This file is a part of the mirimiri library
 #
 # Copyright (C) 2010-2012 Romain Deveaud <romain.deveaud@gmail.com>
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #++
 # Empty top-level base class for index wrappers; it declares no behaviour
 # of its own. Indri::IndriIndex inherits from it.
 class Index
 end
 module Indri
-  class IndriIndex
+  class IndriIndex < Index
-    def exec indriquery
+    def initialize path
+      raise ArgumentError, 'Index path does not exist' unless File.directory? path
+      @path = path
+    end
+    def runquery indriquery
       raise ArgumentError, 'Argument is not an IndriQuery' unless indriquery.is_a? Indri::IndriQuery
-      query = "IndriRunQuery -query#{indriquery.query} -index=#{@path}"
+      query = "IndriRunQuery -query=\"#{indriquery.query}\" -index=#{@path}"
       query += " -count=#{indriquery.count}" unless indriquery.count.nil?
       query += " -rule=method:#{indriquery.sm_method},#{indriquery.sm_param}:#{indriquery.sm_value}" unless indriquery.sm_method.nil?
       query += " #{indriquery.args}" unless indriquery.args.nil?
+      res = `#{query}`
+      res
     end
   end
 end

lib/mirimiri/query.rb

Diff comments View file @ e0e33fc

 #!/usr/bin/env ruby
 #--
 # This file is a part of the mirimiri library
 #
 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #++
 # Empty top-level base class for query objects; it declares no behaviour
 # of its own. Indri::IndriQueryOld and Indri::IndriQuery inherit from it.
 class Query
 end
 module Indri
   # Settings for an Indri run. +to_s+ renders them as a sequence of XML
   # elements, one per line.
   class Parameters
     attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline
     # +corpus+ is stored as the index path. The two print flags are kept as
     # the strings "true" / "false", ready to be written into the XML.
     def initialize(corpus,count="1000",mem="1g",threads="1",offset="1",run_id="default",print_query=false,print_docs=false)
       @index_path, @memory, @count = corpus, mem, count
       @threads, @offset, @run_id   = threads, offset, run_id
       @print_query = (print_query ? true : false).to_s
       @print_docs  = (print_docs  ? true : false).to_s
     end
     # Returns the settings as XML lines. Exactly one of <baseline> / <rule>
     # is emitted: a baseline, when set, takes the place of the rule.
     def to_s
       model = @baseline.nil? ? "<rule>#{@rule}</rule>\n" : "<baseline>#{@baseline}</baseline>\n"
       [
         "<memory>#{@memory}</memory>\n",
         "<index>#{@index_path}</index>\n",
         "<count>#{@count}</count>\n",
         "<threads>#{@threads}</threads>\n",
         model,
         "<trecFormat>true</trecFormat>\n",
         "<queryOffset>#{@offset}</queryOffset>\n",
         "<runID>#{@run_id}</runID>\n",
         "<printQuery>#{@print_query}</printQuery>\n",
         "<printDocuments>#{@print_docs}</printDocuments>\n"
       ].join
     end
   end
   # Older query object: an id plus a query string. It can render itself as
   # a <query> XML element and can run itself through IndriRunQuery.
   class IndriQueryOld < Query
     attr_accessor :id, :query, :rule
     def initialize(id,query)
       @id, @query = id, query
     end
     # XML form: <query> wrapping <number> (the id) and <text> (the query).
     def to_s
       [
         "<query>\n",
         "<number>#{@id}</number>\n",
         "<text>#{@query}</text>\n",
         "</query>\n"
       ].join
     end
     # Runs IndriRunQuery in a subshell and returns what it printed on
     # standard output. Only +index_path+ and +count+ are read from +params+;
     # the smoothing rule is fixed in the command line.
     # NOTE(review): @query is interpolated into the shell command unescaped.
     def exec params
       index = params.index_path
       limit = params.count
       `IndriRunQuery -query='#{@query}' -index=#{index} -count=#{limit} -rule=method:dirichlet,mu:2500 -trecFormat`
     end
   end
   class IndriQuery < Query
     attr_accessor :query, :count, :sm_method, :sm_param, :sm_value, :args
     def initialize atts={},args=nil
-      raise ArgumentError, 'Argument 1 must be a Hash' unless args.is_a? Hash
+      raise ArgumentError, 'Argument 1 must be a Hash' unless atts.is_a? Hash
       atts.each do |k,v|
         instance_variable_set("@#{k}", v) unless v.nil?
       end
-      raise ArgumentError, 'Argument 2 must be a String' unless args.is_a? String
+      raise ArgumentError, 'Argument 2 must be a String' unless (args.is_a?(String) || args.nil?)
       @args = args
     end
   end
   class IndriQueries
     attr_accessor :params, :queries
-    def initialize(params,*queries)
+    def initialize params
-      @queries = queries
+#      @queries = queries
       @params = params
+      @queries = {}
       # Here we set the default retrieval model as Language Modeling
       # with a Dirichlet smoothing at 2500.
       # TODO: maybe a Rule class...
       @params.rule  = 'method:dirichlet,mu:2500' if @params.rule.nil?
     end
+    def push id,query
+      @queries[id.to_i] = query
+    end
     def to_s
       h = "<parameters>\n"
       h += @params.to_s
-      h += @queries.collect { |q| q.to_s }.join ""
+      h += @queries.sort { |a,b| a[0] <=> b[0] }.collect do |q|
+            "<query>\n" +
+            "<number>#{q[0]}</number>\n" +
+            "<text>#{q[1]}</text>\n" +
+            "</query>\n"
+      end.join ""
+#      h += @queries.collect { |q| q.to_s }.join ""
       h += "</parameters>"
       h
     end
   end
 end

lib/mirimiri/string.rb

Diff comments View file @ e0e33fc

 #!/usr/bin/env ruby
 #--
 # This file is a part of the mirimiri library
 #
 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #++
 # Part of the Mirimiri namespace that holds two lookup tables: Stoplist
 # (stopwords) and Transmap (byte sequence => ASCII replacement). The String
 # class includes this module and reads both constants.
 module Mirimiri
   # These are the default stopwords provided by Lemur.
   # All entries are lowercase; String#is_stopword? downcases before lookup.
   Stoplist = [
 "a","about","above","according","across","after","afterwards","again","against",
 "albeit","all","almost","alone","along","already","also","although","always","am",
 "among","amongst","an","and","another","any","anybody","anyhow","anyone","anything",
 "anyway","anywhere","apart","are","around","as","at","av","be","became","because",
 "become","becomes","becoming","been","before","beforehand","behind","being","below",
 "beside","besides","between","beyond","both","but","by","can","cannot","canst",
 "certain","cf","choose","contrariwise","cos","could","cu","day","do","does","doesn't",
 "doing","dost","doth","double","down","dual","during","each","either","else",
 "elsewhere","enough","et","etc","even","ever","every","everybody","everyone",
 "everything","everywhere","except","excepted","excepting","exception","exclude",
 "excluding","exclusive","far","farther","farthest","few","ff","first","for",
 "formerly","forth","forward","from","front","further","furthermore","furthest","get",
 "go","had","halves","hardly","has","hast","hath","have","he","hence","henceforth",
 "her","here","hereabouts","hereafter","hereby","herein","hereto","hereupon","hers",
 "herself","him","himself","hindmost","his","hither","hitherto","how","however",
 "howsoever","i","ie","if","in","inasmuch","inc","include","included","including",
 "indeed","indoors","inside","insomuch","instead","into","inward","inwards","is",
 "it","its","itself","just","kind","kg","km","last","latter","latterly","less","lest",
 "let","like","little","ltd","many","may","maybe","me","meantime","meanwhile","might",
 "moreover","most","mostly","more","mr","mrs","ms","much","must","my","myself",
 "namely","need","neither","never","nevertheless","next","no","nobody","none",
 "nonetheless","noone","nope","nor","not","nothing","notwithstanding","now","nowadays",
 "nowhere","of","off","often","ok","on","once","one","only","onto","or","other",
 "others","otherwise","ought","our","ours","ourselves","out","outside","over","own",
 "per","perhaps","plenty","provide","quite","rather","really","round","said","sake",
 "same","sang","save","saw","see","seeing","seem","seemed","seeming","seems","seen",
 "seldom","selves","sent","several","shalt","she","should","shown","sideways","since",
 "slept","slew","slung","slunk","smote","so","some","somebody","somehow","someone",
 "something","sometime","sometimes","somewhat","somewhere","spake","spat","spoke",
 "spoken","sprang","sprung","stave","staves","still","such","supposing","than","that",
 "the","thee","their","them","themselves","then","thence","thenceforth","there",
 "thereabout","thereabouts","thereafter","thereby","therefore","therein","thereof",
 "thereon","thereto","thereupon","these","they","this","those","thou","though",
 "thrice","through","throughout","thru","thus","thy","thyself","till","to","together",
 "too","toward","towards","ugh","unable","under","underneath","unless","unlike",
 "until","up","upon","upward","upwards","us","use","used","using","very","via","vs",
 "want","was","we","week","well","were","what","whatever","whatsoever","when","whence",
 "whenever","whensoever","where","whereabouts","whereafter","whereas","whereat",
 "whereby","wherefore","wherefrom","wherein","whereinto","whereof","whereon",
 "wheresoever","whereto","whereunto","whereupon","wherever","wherewith","whether",
 "whew","which","whichever","whichsoever","while","whilst","whither","who","whoa",
 "whoever","whole","whom","whomever","whomsoever","whose","whosoever","why","will",
 "wilt","with","within","without","worse","worst","would","wow","ye","yet","year",
 "yippee","you","your","yours","yourself","yourselves",
   # Entries outside the alphabetical run above; they look web/Wikipedia
   # specific -- presumably project additions to the Lemur list (TODO confirm).
   "edit", "new", "page", "article", "http", "www", "com", "org", "wikipedia", "en","html"
   ]
   # Replacement table used by String#unaccent: each key is a two-byte
   # sequence written with \x escapes (the UTF-8 encoding of an accented Latin
   # letter, a ligature or a punctuation mark) and each value is the plain
   # ASCII text substituted for it. An empty value deletes the sequence.
   Transmap = {
   "\xC3\x80" => "A", "\xC3\x81" => "A", "\xC3\x82" => "A", "\xC3\x83" => "A",
   "\xC3\x84" => "A", "\xC3\x85" => "A", "\xC3\x86" => "AE","\xC3\x87" => "C",
   "\xC3\x88" => "E", "\xC3\x89" => "E", "\xC3\x8A" => "E", "\xC3\x8B" => "E",
   "\xC3\x8C" => "I", "\xC3\x8D" => "I", "\xC3\x8E" => "I", "\xC3\x8F" => "I",
   "\xC3\x90" => "D", "\xC3\x91" => "N", "\xC3\x92" => "O", "\xC3\x93" => "O",
   "\xC3\x94" => "O", "\xC3\x95" => "O", "\xC3\x96" => "O", "\xC3\x98" => "O",
   "\xC3\x99" => "U", "\xC3\x9A" => "U", "\xC3\x9B" => "U", "\xC3\x9C" => "U",
   "\xC3\x9D" => "Y", "\xC3\x9E" => "P", "\xC3\x9F" => "ss",
   "\xC3\xA0" => "a", "\xC3\xA1" => "a", "\xC3\xA2" => "a", "\xC3\xA3" => "a",
   "\xC3\xA4" => "a", "\xC3\xA5" => "a", "\xC3\xA6" => "ae","\xC3\xA7" => "c",
   "\xC3\xA8" => "e", "\xC3\xA9" => "e", "\xC3\xAA" => "e", "\xC3\xAB" => "e",
   "\xC3\xAC" => "i", "\xC3\xAD" => "i", "\xC3\xAE" => "i", "\xC3\xAF" => "i",
   "\xC3\xB0" => "o", "\xC3\xB1" => "n", "\xC3\xB2" => "o", "\xC3\xB3" => "o",
   "\xC3\xB4" => "o", "\xC3\xB5" => "o", "\xC3\xB6" => "o", "\xC3\xB8" => "o",
   "\xC3\xB9" => "u", "\xC3\xBA" => "u", "\xC3\xBB" => "u", "\xC3\xBC" => "u",
   "\xC3\xBD" => "y", "\xC3\xBE" => "p", "\xC3\xBF" => "y",
   "\xC4\x80" => "A", "\xC4\x81" => "a", "\xC4\x82" => "A", "\xC4\x83" => "a",
   "\xC4\x84" => "A", "\xC4\x85" => "a", "\xC4\x86" => "C", "\xC4\x87" => "c",
   "\xC4\x88" => "C", "\xC4\x89" => "c", "\xC4\x8A" => "C", "\xC4\x8B" => "c",
   "\xC4\x8C" => "C", "\xC4\x8D" => "c", "\xC4\x8E" => "D", "\xC4\x8F" => "d",
   "\xC4\x90" => "D", "\xC4\x91" => "d", "\xC4\x92" => "E", "\xC4\x93" => "e",
   "\xC4\x94" => "E", "\xC4\x95" => "e", "\xC4\x96" => "E", "\xC4\x97" => "e",
   "\xC4\x98" => "E", "\xC4\x99" => "e", "\xC4\x9A" => "E", "\xC4\x9B" => "e",
   "\xC4\x9C" => "G", "\xC4\x9D" => "g", "\xC4\x9E" => "G", "\xC4\x9F" => "g",
   "\xC4\xA0" => "G", "\xC4\xA1" => "g", "\xC4\xA2" => "G", "\xC4\xA3" => "g",
   "\xC4\xA4" => "H", "\xC4\xA5" => "h", "\xC4\xA6" => "H", "\xC4\xA7" => "h",
   "\xC4\xA8" => "I", "\xC4\xA9" => "i", "\xC4\xAA" => "I", "\xC4\xAB" => "i",
   "\xC4\xAC" => "I", "\xC4\xAD" => "i", "\xC4\xAE" => "I", "\xC4\xAF" => "i",
   "\xC4\xB0" => "I", "\xC4\xB1" => "i", "\xC4\xB2" => "IJ","\xC4\xB3" => "ij",
   "\xC4\xB4" => "J", "\xC4\xB5" => "j", "\xC4\xB6" => "K", "\xC4\xB7" => "k",
   "\xC4\xB8" => "k", "\xC4\xB9" => "L", "\xC4\xBA" => "l", "\xC4\xBB" => "L",
   "\xC4\xBC" => "l", "\xC4\xBD" => "L", "\xC4\xBE" => "l", "\xC4\xBF" => "L",
   "\xC5\x80" => "l", "\xC5\x81" => "L", "\xC5\x82" => "l", "\xC5\x83" => "N",
   "\xC5\x84" => "n", "\xC5\x85" => "N", "\xC5\x86" => "n", "\xC5\x87" => "N",
   "\xC5\x88" => "n", "\xC5\x89" => "n", "\xC5\x8A" => "N", "\xC5\x8B" => "n",
   "\xC5\x8C" => "O", "\xC5\x8D" => "o", "\xC5\x8E" => "O", "\xC5\x8F" => "o",
   "\xC5\x90" => "O", "\xC5\x91" => "o", "\xC5\x92" => "CE","\xC5\x93" => "ce",
   "\xC5\x94" => "R", "\xC5\x95" => "r", "\xC5\x96" => "R", "\xC5\x97" => "r",
   "\xC5\x98" => "R", "\xC5\x99" => "r", "\xC5\x9A" => "S", "\xC5\x9B" => "s",
   "\xC5\x9C" => "S", "\xC5\x9D" => "s", "\xC5\x9E" => "S", "\xC5\x9F" => "s",
   "\xC5\xA0" => "S", "\xC5\xA1" => "s", "\xC5\xA2" => "T", "\xC5\xA3" => "t",
   "\xC5\xA4" => "T", "\xC5\xA5" => "t", "\xC5\xA6" => "T", "\xC5\xA7" => "t",
   "\xC5\xA8" => "U", "\xC5\xA9" => "u", "\xC5\xAA" => "U", "\xC5\xAB" => "u",
   "\xC5\xAC" => "U", "\xC5\xAD" => "u", "\xC5\xAE" => "U", "\xC5\xAF" => "u",
   "\xC5\xB0" => "U", "\xC5\xB1" => "u", "\xC5\xB2" => "U", "\xC5\xB3" => "u",
   "\xC5\xB4" => "W", "\xC5\xB5" => "w", "\xC5\xB6" => "Y", "\xC5\xB7" => "y",
   "\xC5\xB8" => "Y", "\xC5\xB9" => "Z", "\xC5\xBA" => "z", "\xC5\xBB" => "Z",
   "\xC5\xBC" => "z", "\xC5\xBD" => "Z", "\xC5\xBE" => "z", "\xC6\x8F" => "E",
   "\xC6\xA0" => "O", "\xC6\xA1" => "o", "\xC6\xAF" => "U", "\xC6\xB0" => "u",
   "\xC7\x8D" => "A", "\xC7\x8E" => "a", "\xC7\x8F" => "I",
   "\xC7\x90" => "i", "\xC7\x91" => "O", "\xC7\x92" => "o", "\xC7\x93" => "U",
   "\xC7\x94" => "u", "\xC7\x95" => "U", "\xC7\x96" => "u", "\xC7\x97" => "U",
   "\xC7\x98" => "u", "\xC7\x99" => "U", "\xC7\x9A" => "u", "\xC7\x9B" => "U",
   "\xC7\x9C" => "u",
   "\xC7\xBA" => "A", "\xC7\xBB" => "a", "\xC7\xBC" => "AE","\xC7\xBD" => "ae",
   "\xC7\xBE" => "O", "\xC7\xBF" => "o",
   "\xC9\x99" => "e",
   "\xC2\x82" => ",",        # High code comma
   "\xC2\x84" => ",,",       # High code double comma
   "\xC2\x85" => "...",      # Triple dot
   "\xC2\x88" => "^",        # High caret
   "\xC2\x91" => "\x27",     # Forward single quote
   "\xC2\x92" => "\x27",     # Reverse single quote
   "\xC2\x93" => "\x22",     # Forward double quote
   "\xC2\x94" => "\x22",     # Reverse double quote
   "\xC2\x96" => "-",        # High hyphen
   "\xC2\x97" => "--",       # Double hyphen
   "\xC2\xA6" => "|",        # Split vertical bar
   "\xC2\xAB" => "<<",       # Double less than
   "\xC2\xBB" => ">>",       # Double greater than
   "\xC2\xBC" => "1/4",      # one quarter
   "\xC2\xBD" => "1/2",      # one half
   "\xC2\xBE" => "3/4",      # three quarters
   "\xCA\xBF" => "\x27",     # c-single quote
   "\xCC\xA8" => "",         # modifier - under curve
   "\xCC\xB1" => "",         # modifier - under line
 #  /\W/ => ""
   }
 end
 # Extension of the standard class String with useful functions.
 class String
   include Mirimiri
   # Returns a binary (ASCII-8BIT) copy of +self+ in which every byte
   # sequence listed in Transmap has been replaced by its ASCII stand-in.
   #
   # The receiver is left untouched: the work is done on a duplicate, so the
   # caller's string keeps its encoding and frozen strings are accepted.
   def unaccent
     # Matching is byte-wise. The subject and every Transmap key are tagged
     # ASCII-8BIT so that gsub never compares a binary string holding high
     # bytes against a UTF-8 pattern, which raises
     # Encoding::CompatibilityError when the source file is read as UTF-8.
     binary = dup.force_encoding("ASCII-8BIT")
     Transmap.inject(binary) do |str, (utf8, asc)|
       str.gsub(utf8.dup.force_encoding("ASCII-8BIT"), asc)
     end
   end
-  # Returns +true+ if +self+ belongs to Rir::Stoplist, +false+ otherwise.
+  # Returns +true+ if +self+ belongs to Mirimiri::Stoplist, +false+ otherwise.
   def is_stopword?
     # Stoplist entries are lowercase, so compare case-insensitively.
     lowered = downcase
     Stoplist.include?(lowered)
   end
+  def sequential_dependence_model t=0.85,o=0.10,u=0.05,field=nil
+    d = Mirimiri::Document.new self
+    if field.nil?
+      ematch = d.ngrams(2).collect { |ng| "#1(#{ng})" }
+      pmatch = d.ngrams(2).collect { |ng| "#uw8(#{ng})" }
+    else
+      ematch = d.ngrams(2).collect { |ng| "#1(#{ng}).(#{field})" }
+      pmatch = d.ngrams(2).collect { |ng| "#uw8(#{ng}).(#{field})" }
+    end
+    if ematch.empty?
+      if field.nil?
+        ematch = d.words.collect { |ng| "#1(#{ng})" }
+        pmatch = d.words.collect { |ng| "#uw8(#{ng})" }
+      else
+        ematch = d.words.collect { |ng| "#1(#{ng}).(#{field})" }
+        pmatch = d.words.collect { |ng| "#uw8(#{ng}).(#{field})" }
+      end
+    end
+    "#weight ( #{t} #combine( #{self} ) #{o} #combine ( #{ematch.join(" ")} ) #{u} #combine ( #{pmatch.join(" ")} ) )"
+  end
   # Do not use.
   # TODO: revamp. find why this function is here.
   def remove_special_characters
     # A fragment that is exactly one character long is blanked out.
     drop_single = lambda { |text| text.sub(/\A.\z/, '') }
     cleaned = split.map do |token|
       # Non-word characters become separators inside each whitespace token.
       pieces = token.gsub(/\W/, ' ').split.map do |piece|
         drop_single.call(piece.gsub(/\W/, ' ').strip)
       end
       drop_single.call(pieces.join(' ').strip)
     end
     # Blanked tokens still take a slot, so doubled spaces can appear.
     cleaned.join(' ')
   end
   # Removes all XML-like tags from +self+.
   #
   #   s = "<html><body>test</body></html>"
   #   s.strip_xml_tags!
   #   s                                     #=> "test"
   def strip_xml_tags!
     # Anything shaped like an opening or closing tag is removed in place.
     tag_pattern = /<\/?[^>]*>/
     replace(strip_with_pattern(tag_pattern))
   end
   # Removes all XML-like tags from +self+.
   #
   #   s = "<html><body>test</body></html>"
   #   s.strip_xml_tags                      #=> "test"
   #   s                                     #=> "<html><body>test</body></html>"
   def strip_xml_tags
     # Work on a copy so the receiver keeps its tags.
     copy = dup
     copy.strip_xml_tags!
   end
   # Removes all Javascript sources from +self+.
   #
   #   s = "<script type='text/javascript'>
   #         var skin='vector',
   #         stylepath='http://bits.wikimedia.org/skins-1.5'
   #        </script>
   #
   #        test"
   #   s.strip_javascripts!
   #   s                                     #=> "test"
   def strip_javascripts!
     # Match any <script ...> element, whatever its attributes, quoting style
     # or letter case, including empty ones. The previous pattern required
     # the literal text type="text/javascript" in double quotes, so the
     # single-quoted form shown in the method's own example was left in.
     # /m lets the body span several lines.
     replace strip_with_pattern(/<script\b[^>]*>.*?<\/script>/mi)
   end
   # Removes all Javascript sources from +self+.
   #
   #   s = "<script type='text/javascript'>
   #         var skin='vector',
   #         stylepath='http://bits.wikimedia.org/skins-1.5'
   #        </script>
   #
   #        test"
   #   s.strip_javascripts                   #=> "test"
   def strip_javascripts
     # Work on a copy so the receiver keeps its scripts.
     copy = dup
     copy.strip_javascripts!
   end
   # Removes all inline stylesheets (<style> elements and their content)
   # from +self+, in place.
   def strip_stylesheets!
     # Match any <style ...> element, whatever its attributes, quoting style
     # or letter case, including empty ones; the previous pattern required
     # the literal text type="text/css" in double quotes. /m lets the body
     # span several lines.
     replace strip_with_pattern(/<style\b[^>]*>.*?<\/style>/mi)
   end
   def strip_stylesheets
     # Work on a copy so the receiver keeps its stylesheets.
     copy = dup
     copy.strip_stylesheets!
   end
   # Removes punctuation from +self+.
   #
   #   s = "hello, world. how are you?!"
   #   s.strip_punctuation!
   #   s                                 # => "hello world how are you"
   def strip_punctuation!
     # Keep ASCII letters, digits, hyphens and whitespace; drop the rest.
     unwanted = /[^a-zA-Z0-9\-\s]/
     replace(strip_with_pattern(unwanted))
   end
   # Removes punctuation from +self+.
   #
   #   s = "hello, world. how are you?!"
   #   s.strip_punctuation               # => "hello world how are you"
   def strip_punctuation
     # Work on a copy so the receiver keeps its punctuation.
     copy = dup
     copy.strip_punctuation!
   end
   # Returns the text values inside all occurrences of a XML tag in +self+
   #
   #   s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre"
   #   s.extract_xmltags_values 'a' #=> ["Indianapolis", "Indiana"]
   def extract_xmltags_values(tag_name)
     # +tag_name+ is interpolated as-is, so it may itself be a regexp
     # fragment. The \b keeps a short name from matching the start of a
     # longer one ('a' must not match <abbr>, 'p' must not match <pre>).
     # Matching stays single-line: values containing a newline are skipped.
     scan(/<#{tag_name}\b.*?>(.+?)<\/#{tag_name}>/).flatten
   end
   # Returns a UTF-8 copy of +self+ with every match of +pattern+ removed,
   # HTML entities decoded and accented characters mapped through #unaccent.
   # Bytes that cannot be converted to UTF-8 become a single space each.
   def strip_with_pattern(pattern)
     require 'cgi'
     stripped = CGI::unescapeHTML(self.gsub(pattern,""))
     # The options are passed as keywords, not as a braced Hash: Ruby 3
     # takes a positional Hash for the source-encoding argument and raises
     # TypeError.
     stripped.unaccent.encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => " ")
   end
   private :strip_with_pattern
+end
+module Indri
+  class IndriPrintedDocuments < String
+    def extract_docs
+      self.split(/\d+ Q0 .+ \d+ -\d+.\d+ .+/).delete_if{ |x| x.empty? }
+    end
+  end
 end

main.rb

Diff comments View file @ e0e33fc

 $LOAD_PATH.unshift File.expand_path(File.join(File.dirname(__FILE__), "lib"))
 require 'mirimiri'
 w = Mirimiri::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan")
 p w.entropy("dillinger escape plan")
 p w.tf("guitar")
+query = Indri::IndriQuery.new({:query => "dillinger escape plan".sequential_dependence_model, :count => 10}, "-trecFormat=true -printDocuments=true")
+index = Indri::IndriIndex.new "/mnt/disk1/ClueWeb09_English_1noSpam"
+s = Indri::IndriPrintedDocuments.new(index.runquery(query).force_encoding("ISO-8859-1").encode("UTF-8"))