From aa386f5530e2fcc8866a8c83ded03b4f672c4d18 Mon Sep 17 00:00:00 2001
From: Romain Deveaud
Date: Fri, 2 Mar 2012 16:23:51 +0100
Subject: [PATCH] changes in query, new index class

---
Review notes (free-text area between the "---" marker and the diff; it is
not part of the commit message):

* FreebasePage.search_article_ids: the encoding guard tested `name`, which
  is not a parameter of the method, and passed it as a third argument to
  `raise`; it now tests `query`. The query is escaped with
  URI.encode_www_form_component instead of only replacing spaces.
* IndriIndex#exec: "-query" was missing the "=" that the neighbouring
  "-index=", "-count=" and "-rule=" options use. The method now ends with
  `query`, so the built command line is returned even when `args` is nil.
  NOTE(review): @path is never assigned in this class and the command is
  only built, not run - confirm that this is intended.
* IndriQuery#initialize: the first guard checked `args` while its message
  is about argument 1 (`atts`); the second guard rejected the declared
  default `args=nil`.
* index.rb licence header: restored the URL of the standard GPL notice.
* NOTE(review): this copy of the patch looks like it lost angle-bracketed
  text in a few places (the From: address, the regex in extract_anchors,
  the string literals in the Parameters hunk). Those lines are left as
  found - confirm against the original commit before applying.

 lib/mirimiri/document.rb | 24 +++++++++++++++++++++++-
 lib/mirimiri/index.rb    | 39 +++++++++++++++++++++++++++++++++++++++
 lib/mirimiri/query.rb    | 20 ++++++++++++++++++--
 lib/mirimiri/string.rb   |  2 +-
 4 files changed, 81 insertions(+), 4 deletions(-)
 create mode 100644 lib/mirimiri/index.rb

diff --git a/lib/mirimiri/document.rb b/lib/mirimiri/document.rb
index 97af4ca..30cdc00 100644
--- a/lib/mirimiri/document.rb
+++ b/lib/mirimiri/document.rb
@@ -117,12 +117,15 @@ module Mirimiri
       Net::HTTP.get(URI.parse(url))
     end
 
+
     # WebDocument constructor, the content of the Document is the HTML page
     # without the tags.
     def initialize(url,only_tags=nil)
+      require 'sanitize'
+
       @url = url
       content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("")
-      super content.strip_javascripts.strip_xml_tags
+      super Sanitize.clean(content.unaccent.toutf8.force_encoding("UTF-8"), :remove_contents => ['script'])
     end
   end
 
@@ -155,5 +158,24 @@ module Mirimiri
       WikipediaPage.get_url(title[0]) unless title.nil? || title.empty?
     end
 
+    def self.extract_anchors(url)
+      self.get_content(url).extract_xmltags_values('p').join(' ').scan(/(.+?)<\/a>/).delete_if { |a| a[0] =~ /^\/wiki\/.*$/.negated }
+    end
+  end
+
+  class FreebasePage < WebDocument
+    require 'net/http'
+    require 'kconv'
+    require 'json'
+
+    def self.search_article_ids query,limit
+      raise ArgumentError, "Bad encoding" unless query.isutf8
+
+      JSON.parse(Net::HTTP.get( URI.parse "http://api.freebase.com/api/service/search?query=#{URI.encode_www_form_component(query)}&limit=#{limit}" ))['result'].collect { |a| a['article']['id'] unless a['article'].nil? }.compact
+    end
+
+    def self.get_url id
+      "http://api.freebase.com/api/trans/raw#{id}"
+    end
   end
 end
diff --git a/lib/mirimiri/index.rb b/lib/mirimiri/index.rb
new file mode 100644
index 0000000..7316734
--- /dev/null
+++ b/lib/mirimiri/index.rb
@@ -0,0 +1,39 @@
+#!/usr/bin/env ruby
+
+#--
+# This file is a part of the mirimiri library
+#
+# Copyright (C) 2010-2012 Romain Deveaud
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#++
+
+class Index
+end
+
+module Indri
+
+  class IndriIndex
+
+    def exec indriquery
+      raise ArgumentError, 'Argument is not an IndriQuery' unless indriquery.is_a? Indri::IndriQuery
+
+      query = "IndriRunQuery -query=#{indriquery.query} -index=#{@path}"
+      query += " -count=#{indriquery.count}" unless indriquery.count.nil?
+      query += " -rule=method:#{indriquery.sm_method},#{indriquery.sm_param}:#{indriquery.sm_value}" unless indriquery.sm_method.nil?
+      query += " #{indriquery.args}" unless indriquery.args.nil?
+      query
+    end
+  end
+end
diff --git a/lib/mirimiri/query.rb b/lib/mirimiri/query.rb
index 66d0887..1fe5cb8 100644
--- a/lib/mirimiri/query.rb
+++ b/lib/mirimiri/query.rb
@@ -27,10 +27,11 @@ module Indri
   class Parameters
     attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline
 
-    def initialize(corpus,count="1000",mem="1g",offset="1",run_id="default",print_query=false,print_docs=false)
+    def initialize(corpus,count="1000",mem="1g",threads="1",offset="1",run_id="default",print_query=false,print_docs=false)
       @index_path = corpus
       @memory = mem
       @count = count
+      @threads = threads
       @offset = offset
       @run_id = run_id
       @print_query = print_query ? "true" : "false"
@@ -41,6 +42,7 @@ module Indri
       h = "#{@memory}\n"
       h += "#{@index_path}\n"
       h += "#{@count}\n"
+      h += "#{@threads}\n"
       unless @baseline.nil?
         h += "#{@baseline}\n"
       else
@@ -56,7 +58,7 @@ module Indri
     end
   end
 
-  class IndriQuery < Query
+  class IndriQueryOld < Query
     attr_accessor :id, :query, :rule
 
     def initialize(id,query)
@@ -78,6 +80,20 @@ module Indri
     end
   end
 
+  class IndriQuery < Query
+    attr_accessor :query, :count, :sm_method, :sm_param, :sm_value, :args
+
+    def initialize atts={},args=nil
+      raise ArgumentError, 'Argument 1 must be a Hash' unless atts.is_a? Hash
+      atts.each do |k,v|
+        instance_variable_set("@#{k}", v) unless v.nil?
+      end
+
+      raise ArgumentError, 'Argument 2 must be a String' unless args.nil? || args.is_a?(String)
+      @args = args
+    end
+  end
+
   class IndriQueries
     attr_accessor :params, :queries
 
diff --git a/lib/mirimiri/string.rb b/lib/mirimiri/string.rb
index d91a7a7..18704e2 100644
--- a/lib/mirimiri/string.rb
+++ b/lib/mirimiri/string.rb
@@ -67,7 +67,7 @@ module Mirimiri
     "whoever","whole","whom","whomever","whomsoever","whose","whosoever","why","will",
     "wilt","with","within","without","worse","worst","would","wow","ye","yet","year",
     "yippee","you","your","yours","yourself","yourselves",
-    "edit", "new", "page", "article", "http", "www", "com", "org", "wikipedia", "en"
+    "edit", "new", "page", "article", "http", "www", "com", "org", "wikipedia", "en","html"
   ]
 
   Transmap = {
-- 
1.8.2.3