finally committing some recent changes

Romain Deveaud
1 parent b3c0213975
Showing 7 changed files with 146 additions and 16 deletions Side-by-side Diff
README.markdown
lib/mirimiri/document.rb
lib/mirimiri/index.rb
lib/mirimiri/query.rb
lib/mirimiri/result.rb
lib/mirimiri/string.rb
main.rb
 # mirimiri
  
-Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
+The various tools of this project were developed for research purposes during
+my Ph.D. and rely heavily on Indri (<http://lemurproject.org/indri.php>).
+Setting up Ruby is not as painful as it used to be, thanks to RVM (<https://rvm.io/>).
+Visit at least these two websites before trying to use `mirimiri`.
+
+
+Copyright (C) 2010-2013 Romain Deveaud <romain.deveaud@gmail.com>
  
 > The Fijian monkey-faced bat (Mirimiri acrodonta), also called the Fiji 
 > Flying Fox, is an Old World fruit bat endemic to Fiji. It was discovered 
@@ -25,7 +25,7 @@
  
   # A Document is a bag of words and is constructed from a string.
   class Document
-    attr_reader :words, :doc_content, :count_words
+    attr_reader :words, :doc_content, :xcount
  
     # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html
     # and the \\W special escape).
@@ -36,7 +36,7 @@
  
       @doc_content.split.each do |w|
         w.split(/\W/).each do |sw| 
-          wo.push(sw.downcase) if sw =~ /[a-zA-Z]/ 
+          wo.push(sw.downcase) if sw =~ /[[:alpha:]]/ 
         end
       end
  
@@ -80,7 +80,7 @@
       en = 0.0
  
       s.split.each do |w|
-        p_wi = @count_words[w].to_f/@words.count.to_f
+        p_wi = @xcount[w].to_f/@words.count.to_f
         en += p_wi*Math.log2(p_wi)
       end
  
@@ -101,7 +101,7 @@
       size = s.split.size
  
       if size == 1
-        p_wi = @count_words[s].to_f/@words.count.to_f
+        p_wi = @xcount[s].to_f/@words.count.to_f
         en += p_wi*Math.log(p_wi)
       elsif size > 1
         ng_size = ngrams(size)
  
  
  
@@ -117,14 +117,28 @@
     #
     #   tf("guitar") #=> 0.000380372765310004
     def tf(s)
-      @count_words[s].to_f/@words.size.to_f
+      @xcount[s].to_f/@words.size.to_f
     end
  
+    # Computes the KL divergence between the language model of +self+
+    # and the language model of +doc+, summed over the words they share.
+    #
+    # KL is not symmetric, see http://en.wikipedia.org/wiki/Kullback-Leibler_divergence
+    # for more information.
+    #
+    #   d1.kl(d2) #=> 0.2971808085725761
+    def kl(doc)
+      raise ArgumentError, 'Argument is not a Mirimiri::Document' unless doc.is_a? Mirimiri::Document 
+     
+      vocab = self.words & doc.words
  
+      vocab.inject(0.0) { |res,w| res + self.tf(w)*Math.log(self.tf(w)/doc.tf(w)) }
+    end
+
     def initialize(content="")
       @doc_content = content
       @words = format_words
-      @count_words = count_words
+      @xcount = count_words
       @ngrams = {}
     end
  
@@ -149,7 +163,7 @@
  
       @url = url
       content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("")
-      super Sanitize.clean(content.unaccent.toutf8.force_encoding("UTF-8"), :remove_contents => ['script'])
+      super Sanitize.clean(content, :remove_contents => ['script','style'])
     end
   end
  
  
@@ -161,9 +175,9 @@
  
  
     def self.search_wikipedia_titles(name)
-      raise ArgumentError, "Bad encoding", name unless name.isutf8
+#      raise ArgumentError, "Bad encoding", name unless name.isutf8
  
-      res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).unaccent.toutf8).elements['api/query/search']
+      res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&srlimit=20&format=xml" ).force_encoding("ISO-8859-1").encode("UTF-8")).elements['api/query/search']
  
      res.collect { |e| e.attributes['title'] } unless res.nil?
     end
@@ -32,7 +32,7 @@
     end
  
     def runquery indriquery
-      raise ArgumentError, 'Argument is not an IndriQuery' unless indriquery.is_a? Indri::IndriQuery
+      raise ArgumentError, 'Argument is not an Indri::IndriQuery' unless indriquery.is_a? Indri::IndriQuery
  
       query = "IndriRunQuery -query=\"#{indriquery.query}\" -index=#{@path}"
  
@@ -20,6 +20,9 @@
 #++
  
 class Query
+  attr_accessor :query
+
+
 end
  
 module Indri
@@ -27,7 +30,7 @@
   class Parameters
     attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline
  
-    def initialize(corpus,count="1000",mem="1g",threads="1",offset="1",run_id="default",print_query=false,print_docs=false)
+    def initialize(corpus,count="1000",mem="1g",threads="1",offset="1",run_id="default",print_passages=false,print_query=false,print_docs=false)
       @index_path  = corpus
       @memory      = mem
       @count       = count
  
@@ -36,11 +39,15 @@
       @run_id      = run_id
       @print_query = print_query ? "true" : "false"
       @print_docs  = print_docs  ? "true" : "false"
+      @print_passages  = print_passages  ? "true" : "false"
+      @indexes     = [corpus]
     end
  
     def to_s
       h = "<memory>#{@memory}</memory>\n"
-      h += "<index>#{@index_path}</index>\n"
+      @indexes.each do |i|
+        h += "<index>#{i}</index>\n"
+      end
       h += "<count>#{@count}</count>\n"
       h += "<threads>#{@threads}</threads>\n"
       unless @baseline.nil?
  
@@ -51,11 +58,16 @@
       h += "<trecFormat>true</trecFormat>\n"
       h += "<queryOffset>#{@offset}</queryOffset>\n"
       h += "<runID>#{@run_id}</runID>\n"
+      h += "<printPassages>#{@print_passages}</printPassages>\n"
       h += "<printQuery>#{@print_query}</printQuery>\n"
       h += "<printDocuments>#{@print_docs}</printDocuments>\n"
  
       h
     end
+
+    def add_index path
+      @indexes << path
+    end
   end
  
   class IndriQueryOld < Query
@@ -91,6 +103,10 @@
  
       raise ArgumentError, 'Argument 2 must be a String' unless (args.is_a?(String) || args.nil?)
       @args = args 
+    end
+
+    def clarity index_path,terms=10,documents=5
+      `clarity -index=#{index_path} -documents=#{documents} -terms=#{terms} -smoothing=\"method:#{@sm_method},#{@sm_param}:#{@sm_value}\" -query=\"#{query}\"`.split("=").last.strip
     end
   end
  
+#!/usr/bin/env ruby
+
+#--
+# This file is a part of the mirimiri library
+#
+# Copyright (C) 2010-2012 Romain Deveaud <romain.deveaud@gmail.com>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#++
+
+module Mirimiri
+
+  # This class represents one line of a TREC-formatted retrieval
+  # result, as typically output by Indri or Terrier.
+  class TrecResult
+    attr_accessor :topic, :doc, :rank, :score, :run
+
+    def initialize arg
+      t = arg.split 
+      @topic = t[0]
+      @doc   = t[2]
+      @rank  = t[3]
+      @score = t[4]
+      @run   = t[5]
+    end
+  end
+
+  # This class represents the output of trec_eval when
+  # the -q option is given.
+  class TrecEval
+    attr_accessor :metric, :run, :queries
+
+    def initialize arg
+      @queries = {}
+
+      arg.each_line do |line|
+        t = line.split
+        @metric = t[0] if @metric.nil?
+        @queries[t[1]] = t[2].to_f if t[1].is_integer?
+      end
+    end
+  end
+
+  # An array of TrecResult objects, i.e. a run.
+  class TrecResults < Array
+
+    def initialize args
+      super args.collect { |res| TrecResult.new res }
+    end
+  end
+end
@@ -67,7 +67,8 @@
 "whoever","whole","whom","whomever","whomsoever","whose","whosoever","why","will",
 "wilt","with","within","without","worse","worst","would","wow","ye","yet","year",
 "yippee","you","your","yours","yourself","yourselves",
-  "edit", "new", "page", "article", "http", "www", "com", "org", "wikipedia", "en","html"
+  "edit", "new", "page", "article", "http", "www", "com", "org", "wikipedia", "en","html",
+  "amp","nbsp","quot"
   ]
  
   Transmap = { 
@@ -158,6 +159,7 @@
  
   def unaccent
     # force_encoding is needed with ruby1.9
+#    Transmap.inject(self) { |str, (utf8, asc)| str.gsub(utf8, asc) }
     Transmap.inject(self.force_encoding("ASCII-8BIT")) { |str, (utf8, asc)| str.gsub(utf8, asc) }
   end
  
@@ -166,7 +168,15 @@
     self.split.all? { |e| Stoplist.include?(e.downcase) }
   end
  
-  def sequential_dependence_model t=0.85,o=0.10,u=0.05,field=nil
+  def is_integer?
+    !self.empty? && self =~ /\A\d+\Z/ 
+  end
+
+  def numeric?
+    Float(self) != nil rescue false
+  end
+
+  def sequential_dependence_model field=nil,t=0.85,o=0.10,u=0.05
     d = Mirimiri::Document.new self
  
     if field.nil?
@@ -288,7 +298,13 @@
   class IndriPrintedDocuments < String
  
     def extract_docs
-      self.split(/\d+ Q0 .+ \d+ -\d+.\d+ .+/).delete_if{ |x| x.empty? }  
+      self.split(/\d+ Q0 .+ \d+ -\d+.\d+ .+/).delete_if{ |x| x.empty? } 
+    end
+
+    def extract_docs_score
+      score = self.scan(/\d+ Q0 .+ \d+ (-\d+.\d+) .+/).flatten
+      name  = self.scan(/\d+ Q0 (.+) \d+ -\d+.\d+ .+/).collect { |n| n.first.scan(/(\d+).xml/).first }
+      return self.split(/\d+ Q0 .+ \d+ -\d+.\d+ .+/).delete_if{ |x| x.empty? },score,name 
     end
   end
 end
@@ -3,11 +3,27 @@
 require 'mirimiri'
 require "benchmark"
  
+# Fetch the text content of two Wikipedia pages using their URLs
 w = Mirimiri::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan")
+u = Mirimiri::WikipediaPage.new("http://en.wikipedia.org/wiki/Pantera")
+
+# Compute the entropy of a word sequence, using `w` as context
 p w.entropy("dillinger escape plan")
 p w.tf("guitar")
  
+# Compute the KL-Divergence between the two pages
+p w.kl u
+
+
+# Mirimiri also comprises Indri-related classes
+
+# Building an Indri query
 query = Indri::IndriQuery.new({:query => "dillinger escape plan".sequential_dependence_model, :count => 10}, "-trecFormat=true -printDocuments=true")
+
+# Initializing the index on which the query will be executed.
+# It must have been previously built using `IndriBuildIndex`.
 index = Indri::IndriIndex.new "/mnt/disk1/ClueWeb09_English_1noSpam"
+
+# Run the query on the index and fetch the text of the documents
 s = Indri::IndriPrintedDocuments.new(index.runquery(query).force_encoding("ISO-8859-1").encode("UTF-8"))
1	1	# mirimiri
2	2
3		-Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
	3	+The various tools of this project were developed for research purposes during
	4	+my Ph.D. and rely heavily on Indri (<http://lemurproject.org/indri.php>).
	5	+Setting up Ruby is not as painful as it used to be, thanks to RVM (<https://rvm.io/>).
	6	+Visit at least these two websites before trying to use `mirimiri`.
	7	+
	8	+
	9	+Copyright (C) 2010-2013 Romain Deveaud <romain.deveaud@gmail.com>
4	10
5	11	> The Fijian monkey-faced bat (Mirimiri acrodonta), also called the Fiji
6	12	> Flying Fox, is an Old World fruit bat endemic to Fiji. It was discovered
...	...	@@ -25,7 +25,7 @@
25	25
26	26	# A Document is a bag of words and is constructed from a string.
27	27	class Document
28		- attr_reader :words, :doc_content, :count_words
	28	+ attr_reader :words, :doc_content, :xcount
29	29
30	30	# Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html
31	31	# and the \\W special escape).
...	...	@@ -36,7 +36,7 @@
36	36
37	37	@doc_content.split.each do \|w\|
38	38	w.split(/\W/).each do \|sw\|
39		- wo.push(sw.downcase) if sw =~ /[a-zA-Z]/
	39	+ wo.push(sw.downcase) if sw =~ /[[:alpha:]]/
40	40	end
41	41	end
42	42
...	...	@@ -80,7 +80,7 @@
80	80	en = 0.0
81	81
82	82	s.split.each do \|w\|
83		- p_wi = @count_words[w].to_f/@words.count.to_f
	83	+ p_wi = @xcount[w].to_f/@words.count.to_f
84	84	en += p_wi*Math.log2(p_wi)
85	85	end
86	86
...	...	@@ -101,7 +101,7 @@
101	101	size = s.split.size
102	102
103	103	if size == 1
104		- p_wi = @count_words[s].to_f/@words.count.to_f
	104	+ p_wi = @xcount[s].to_f/@words.count.to_f
105	105	en += p_wi*Math.log(p_wi)
106	106	elsif size > 1
107	107	ng_size = ngrams(size)
108	108
109	109
110	110
...	...	@@ -117,14 +117,28 @@
117	117	#
118	118	# tf("guitar") #=> 0.000380372765310004
119	119	def tf(s)
120		- @count_words[s].to_f/@words.size.to_f
	120	+ @xcount[s].to_f/@words.size.to_f
121	121	end
122	122
	123	+ # Computes the KL divergence between the language model of +self+
	124	+ # and the language model of +doc+, summed over the words they share.
	125	+ #
	126	+ # KL is not symmetric, see http://en.wikipedia.org/wiki/Kullback-Leibler_divergence
	127	+ # for more information.
	128	+ #
	129	+ # d1.kl(d2) #=> 0.2971808085725761
	130	+ def kl(doc)
	131	+ raise ArgumentError, 'Argument is not a Mirimiri::Document' unless doc.is_a? Mirimiri::Document
	132	+
	133	+ vocab = self.words & doc.words
123	134
	135	+ vocab.inject(0.0) { \|res,w\| res + self.tf(w)*Math.log(self.tf(w)/doc.tf(w)) }
	136	+ end
	137	+
124	138	def initialize(content="")
125	139	@doc_content = content
126	140	@words = format_words
127		- @count_words = count_words
	141	+ @xcount = count_words
128	142	@ngrams = {}
129	143	end
130	144
...	...	@@ -149,7 +163,7 @@
149	163
150	164	@url = url
151	165	content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("")
152		- super Sanitize.clean(content.unaccent.toutf8.force_encoding("UTF-8"), :remove_contents => ['script'])
	166	+ super Sanitize.clean(content, :remove_contents => ['script','style'])
153	167	end
154	168	end
155	169
156	170
...	...	@@ -161,9 +175,9 @@
161	175
162	176
163	177	def self.search_wikipedia_titles(name)
164		- raise ArgumentError, "Bad encoding", name unless name.isutf8
	178	+# raise ArgumentError, "Bad encoding", name unless name.isutf8
165	179
166		- res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).unaccent.toutf8).elements['api/query/search']
	180	+ res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&srlimit=20&format=xml" ).force_encoding("ISO-8859-1").encode("UTF-8")).elements['api/query/search']
167	181
168	182	res.collect { \|e\| e.attributes['title'] } unless res.nil?
169	183	end
...	...	@@ -32,7 +32,7 @@
32	32	end
33	33
34	34	def runquery indriquery
35		- raise ArgumentError, 'Argument is not an IndriQuery' unless indriquery.is_a? Indri::IndriQuery
	35	+ raise ArgumentError, 'Argument is not an Indri::IndriQuery' unless indriquery.is_a? Indri::IndriQuery
36	36
37	37	query = "IndriRunQuery -query=\"#{indriquery.query}\" -index=#{@path}"
38	38
...	...	@@ -20,6 +20,9 @@
20	20	#++
21	21
22	22	class Query
	23	+ attr_accessor :query
	24	+
	25	+
23	26	end
24	27
25	28	module Indri
...	...	@@ -27,7 +30,7 @@
27	30	class Parameters
28	31	attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline
29	32
30		- def initialize(corpus,count="1000",mem="1g",threads="1",offset="1",run_id="default",print_query=false,print_docs=false)
	33	+ def initialize(corpus,count="1000",mem="1g",threads="1",offset="1",run_id="default",print_passages=false,print_query=false,print_docs=false)
31	34	@index_path = corpus
32	35	@memory = mem
33	36	@count = count
34	37
...	...	@@ -36,11 +39,15 @@
36	39	@run_id = run_id
37	40	@print_query = print_query ? "true" : "false"
38	41	@print_docs = print_docs ? "true" : "false"
	42	+ @print_passages = print_passages ? "true" : "false"
	43	+ @indexes = [corpus]
39	44	end
40	45
41	46	def to_s
42	47	h = "<memory>#{@memory}</memory>\n"
43		- h += "<index>#{@index_path}</index>\n"
	48	+ @indexes.each do \|i\|
	49	+ h += "<index>#{i}</index>\n"
	50	+ end
44	51	h += "<count>#{@count}</count>\n"
45	52	h += "<threads>#{@threads}</threads>\n"
46	53	unless @baseline.nil?
47	54
...	...	@@ -51,11 +58,16 @@
51	58	h += "<trecFormat>true</trecFormat>\n"
52	59	h += "<queryOffset>#{@offset}</queryOffset>\n"
53	60	h += "<runID>#{@run_id}</runID>\n"
	61	+ h += "<printPassages>#{@print_passages}</printPassages>\n"
54	62	h += "<printQuery>#{@print_query}</printQuery>\n"
55	63	h += "<printDocuments>#{@print_docs}</printDocuments>\n"
56	64
57	65	h
58	66	end
	67	+
	68	+ def add_index path
	69	+ @indexes << path
	70	+ end
59	71	end
60	72
61	73	class IndriQueryOld < Query
...	...	@@ -91,6 +103,10 @@
91	103
92	104	raise ArgumentError, 'Argument 2 must be a String' unless (args.is_a?(String) \|\| args.nil?)
93	105	@args = args
	106	+ end
	107	+
	108	+ def clarity index_path,terms=10,documents=5
	109	+ `clarity -index=#{index_path} -documents=#{documents} -terms=#{terms} -smoothing=\"method:#{@sm_method},#{@sm_param}:#{@sm_value}\" -query=\"#{query}\"`.split("=").last.strip
94	110	end
95	111	end
96	112
	1	+#!/usr/bin/env ruby
	2	+
	3	+#--
	4	+# This file is a part of the mirimiri library
	5	+#
	6	+# Copyright (C) 2010-2012 Romain Deveaud <romain.deveaud@gmail.com>
	7	+#
	8	+# This program is free software: you can redistribute it and/or modify
	9	+# it under the terms of the GNU General Public License as published by
	10	+# the Free Software Foundation, either version 3 of the License, or
	11	+# (at your option) any later version.
	12	+#
	13	+# This program is distributed in the hope that it will be useful,
	14	+# but WITHOUT ANY WARRANTY; without even the implied warranty of
	15	+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	16	+# GNU General Public License for more details.
	17	+#
	18	+# You should have received a copy of the GNU General Public License
	19	+# along with this program. If not, see <http://www.gnu.org/licenses/>.
	20	+#++
	21	+
	22	+module Mirimiri
	23	+
	24	+ # This class represents one line of a TREC-formatted retrieval
	25	+ # result, as typically output by Indri or Terrier.
	26	+ class TrecResult
	27	+ attr_accessor :topic, :doc, :rank, :score, :run
	28	+
	29	+ def initialize arg
	30	+ t = arg.split
	31	+ @topic = t[0]
	32	+ @doc = t[2]
	33	+ @rank = t[3]
	34	+ @score = t[4]
	35	+ @run = t[5]
	36	+ end
	37	+ end
	38	+
	39	+ # This class represents the output of trec_eval when
	40	+ # the -q option is given.
	41	+ class TrecEval
	42	+ attr_accessor :metric, :run, :queries
	43	+
	44	+ def initialize arg
	45	+ @queries = {}
	46	+
	47	+ arg.each_line do \|line\|
	48	+ t = line.split
	49	+ @metric = t[0] if @metric.nil?
	50	+ @queries[t[1]] = t[2].to_f if t[1].is_integer?
	51	+ end
	52	+ end
	53	+ end
	54	+
	55	+ # An array of TrecResult objects, i.e. a run.
	56	+ class TrecResults < Array
	57	+
	58	+ def initialize args
	59	+ super args.collect { \|res\| TrecResult.new res }
	60	+ end
	61	+ end
	62	+end
...	...	@@ -67,7 +67,8 @@
67	67	"whoever","whole","whom","whomever","whomsoever","whose","whosoever","why","will",
68	68	"wilt","with","within","without","worse","worst","would","wow","ye","yet","year",
69	69	"yippee","you","your","yours","yourself","yourselves",
70		- "edit", "new", "page", "article", "http", "www", "com", "org", "wikipedia", "en","html"
	70	+ "edit", "new", "page", "article", "http", "www", "com", "org", "wikipedia", "en","html",
	71	+ "amp","nbsp","quot"
71	72	]
72	73
73	74	Transmap = {
...	...	@@ -158,6 +159,7 @@
158	159
159	160	def unaccent
160	161	# force_encoding is needed with ruby1.9
	162	+# Transmap.inject(self) { \|str, (utf8, asc)\| str.gsub(utf8, asc) }
161	163	Transmap.inject(self.force_encoding("ASCII-8BIT")) { \|str, (utf8, asc)\| str.gsub(utf8, asc) }
162	164	end
163	165
...	...	@@ -166,7 +168,15 @@
166	168	self.split.all? { \|e\| Stoplist.include?(e.downcase) }
167	169	end
168	170
169		- def sequential_dependence_model t=0.85,o=0.10,u=0.05,field=nil
	171	+ def is_integer?
	172	+ !self.empty? && self =~ /\A\d+\Z/
	173	+ end
	174	+
	175	+ def numeric?
	176	+ Float(self) != nil rescue false
	177	+ end
	178	+
	179	+ def sequential_dependence_model field=nil,t=0.85,o=0.10,u=0.05
170	180	d = Mirimiri::Document.new self
171	181
172	182	if field.nil?
...	...	@@ -288,7 +298,13 @@
288	298	class IndriPrintedDocuments < String
289	299
290	300	def extract_docs
291		- self.split(/\d+ Q0 .+ \d+ -\d+.\d+ .+/).delete_if{ \|x\| x.empty? }
	301	+ self.split(/\d+ Q0 .+ \d+ -\d+.\d+ .+/).delete_if{ \|x\| x.empty? }
	302	+ end
	303	+
	304	+ def extract_docs_score
	305	+ score = self.scan(/\d+ Q0 .+ \d+ (-\d+.\d+) .+/).flatten
	306	+ name = self.scan(/\d+ Q0 (.+) \d+ -\d+.\d+ .+/).collect { \|n\| n.first.scan(/(\d+).xml/).first }
	307	+ return self.split(/\d+ Q0 .+ \d+ -\d+.\d+ .+/).delete_if{ \|x\| x.empty? },score,name
292	308	end
293	309	end
294	310	end
...	...	@@ -3,11 +3,27 @@
3	3	require 'mirimiri'
4	4	require "benchmark"
5	5
	6	+# Fetch the text content of two Wikipedia pages using their URLs
6	7	w = Mirimiri::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan")
	8	+u = Mirimiri::WikipediaPage.new("http://en.wikipedia.org/wiki/Pantera")
	9	+
	10	+# Compute the entropy of a word sequence, using `w` as context
7	11	p w.entropy("dillinger escape plan")
8	12	p w.tf("guitar")
9	13
	14	+# Compute the KL-Divergence between the two pages
	15	+p w.kl u
	16	+
	17	+
	18	+# Mirimiri also comprises Indri-related classes
	19	+
	20	+# Building an Indri query
10	21	query = Indri::IndriQuery.new({:query => "dillinger escape plan".sequential_dependence_model, :count => 10}, "-trecFormat=true -printDocuments=true")
	22	+
	23	+# Initializing the index on which the query will be executed.
	24	+# It must have been previously built using `IndriBuildIndex`.
11	25	index = Indri::IndriIndex.new "/mnt/disk1/ClueWeb09_English_1noSpam"
	26	+
	27	+# Run the query on the index and fetch the text of the documents
12	28	s = Indri::IndriPrintedDocuments.new(index.runquery(query).force_encoding("ISO-8859-1").encode("UTF-8"))