Blame view

main.rb 1.08 KB
7043da90b   Romain Deveaud   first commit
1
  # Prepend the `lib` directory that sits next to this script to the load path,
  # so it is searched before any other location.
  $LOAD_PATH.unshift(File.expand_path("lib", File.dirname(__FILE__)))
b3995017e   Romain Deveaud   changing the name...
2
  require 'mirimiri'
b3c021397   Romain Deveaud   faster computing ...
3
  require "benchmark"
fd4cb285a   Romain Deveaud   doc changes + doc...
4

b0ffa2ad4   Romain Deveaud   finally committin...
5
  # Fetch the text content of two Wikipedia pages using their URLs
b3995017e   Romain Deveaud   changing the name...
6
  # First page: built from its Wikipedia URL and kept in `w` for the
  # computations further down.
  dillinger_url = "http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan"
  w = Mirimiri::WikipediaPage.new(dillinger_url)
b0ffa2ad4   Romain Deveaud   finally committin...
7
8
9
  # Second page: built from its Wikipedia URL and kept in `u`.
  pantera_url = "http://en.wikipedia.org/wiki/Pantera"
  u = Mirimiri::WikipediaPage.new(pantera_url)
  
  # Compute the entropy of a word sequence, using `w` as context
a79a22843   Romain Deveaud   new TreeTagger mo...
10
11
  # Print the entropy that page `w` reports for the word sequence.
  sequence_entropy = w.entropy("dillinger escape plan")
  p(sequence_entropy)
  # Print `w.tf` for a single word (presumably its term frequency in the
  # page -- confirm against the Mirimiri documentation).
  guitar_tf = w.tf("guitar")
  p(guitar_tf)
e0e33fca0   Romain Deveaud   new way of queryi...
12

b0ffa2ad4   Romain Deveaud   finally committin...
13
14
15
16
17
18
19
  # Print the KL-Divergence computed by page `w` against page `u`
  divergence = w.kl(u)
  p(divergence)
  
  
  # Mirimiri also comprises Indri-related classes
  
  # Building an Indri query
e0e33fca0   Romain Deveaud   new way of queryi...
20
  # The query text comes from String#sequential_dependence_model, which is not
  # defined in this file (presumably provided by mirimiri -- confirm).
  # `count: 10` presumably limits the number of results -- confirm.
  query_params = { query: "dillinger escape plan".sequential_dependence_model, count: 10 }
  # Extra arguments passed to IndriQuery.new as one string.
  query_flags = "-trecFormat=true -printDocuments=true"
  query = Indri::IndriQuery.new(query_params, query_flags)
b0ffa2ad4   Romain Deveaud   finally committin...
21
22
23
  
  # Initializing the index on which the query will be executed
  # Must have been previously built using `IndriBuildIndex`
e0e33fca0   Romain Deveaud   new way of queryi...
24
  # Hard-coded, machine-specific index location; adjust it for your own setup.
  index_path = "/mnt/disk1/ClueWeb09_English_1noSpam"
  index = Indri::IndriIndex.new(index_path)
b0ffa2ad4   Romain Deveaud   finally committin...
25
26
  
  # Run the query on the index and fetch the text of the documents
e0e33fca0   Romain Deveaud   new way of queryi...
27
  # Run the query against the index. The result is relabelled as ISO-8859-1
  # and then transcoded to UTF-8 before being wrapped in IndriPrintedDocuments.
  raw_output = index.runquery(query)
  utf8_output = raw_output.force_encoding("ISO-8859-1").encode("UTF-8")
  s = Indri::IndriPrintedDocuments.new(utf8_output)