Blame view
main.rb
1.08 KB
7043da90b first commit |
1 |
$LOAD_PATH.unshift File.expand_path(File.join(File.dirname(__FILE__), "lib")) |
b3995017e changing the name... |
2 |
require 'mirimiri' |
b3c021397 faster computing ... |
3 |
require "benchmark" |
fd4cb285a doc changes + doc... |
4 |
|
b0ffa2ad4 finally committin... |
5 |
# Fetch the text content of two Wikipedia pages using their URLs |
b3995017e changing the name... |
6 |
w = Mirimiri::WikipediaPage.new("http://en.wikipedia.org/wiki/The_Dillinger_Escape_Plan") |
b0ffa2ad4 finally committin... |
7 8 9 |
u = Mirimiri::WikipediaPage.new("http://en.wikipedia.org/wiki/Pantera")

# Compute the entropy of a word sequence, using `w` as context |
a79a22843 new TreeTagger mo... |
10 11 |
p w.entropy("dillinger escape plan")
p w.tf("guitar") |
e0e33fca0 new way of queryi... |
12 |
|
b0ffa2ad4 finally committin... |
13 14 15 16 17 18 19 |
# Compute the KL-Divergence between the two pages
p w.kl u

# Mirimiri also comprises Indri-related classes

# Building an Indri query |
e0e33fca0 new way of queryi... |
20 |
query = Indri::IndriQuery.new({:query => "dillinger escape plan".sequential_dependence_model, :count => 10}, "-trecFormat=true -printDocuments=true") |
b0ffa2ad4 finally committin... |
21 22 23 |
# Initializing the index on which the query will be executed
# Must have been previously built using `IndriBuildIndex` |
e0e33fca0 new way of queryi... |
24 |
index = Indri::IndriIndex.new "/mnt/disk1/ClueWeb09_English_1noSpam" |
b0ffa2ad4 finally committin... |
25 26 |
# Run the query on the index and fetch the text of the documents |
e0e33fca0 new way of queryi... |
27 |
s = Indri::IndriPrintedDocuments.new(index.runquery(query).force_encoding("ISO-8859-1").encode("UTF-8")) |