Commit aa386f5530e2fcc8866a8c83ded03b4f672c4d18

Authored by Romain Deveaud
1 parent b768fe9411
Exists in master

changes in query, new index class

Showing 4 changed files with 81 additions and 4 deletions Side-by-side Diff

lib/mirimiri/document.rb
... ... @@ -117,12 +117,15 @@
117 117 Net::HTTP.get(URI.parse(url))
118 118 end
119 119  
  120 +
120 121 # WebDocument constructor, the content of the Document is the HTML page
121 122 # without the tags.
122 123 def initialize(url,only_tags=nil)
# url::       address of the page to fetch; kept in @url.
# only_tags:: optional tag selector; when given, only the values returned by
#             extract_xmltags_values for it are kept and concatenated.
# The cleaned text is handed to the parent constructor as the document content.
# The require sits inside the method, presumably so the sanitize gem is only
# needed once a WebDocument is actually built -- TODO confirm.
  124 + require 'sanitize'
  125 +
123 126 @url = url
124 127 content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("")
# Removed by this commit: cleanup through the String helpers
# strip_javascripts and strip_xml_tags.
125   - super content.strip_javascripts.strip_xml_tags
# Added by this commit: the content is unaccented, converted to and tagged as
# UTF-8, then cleaned by Sanitize; :remove_contents => ['script'] asks Sanitize
# to drop the body of <script> elements as well as the tags themselves.
  128 + super Sanitize.clean(content.unaccent.toutf8.force_encoding("UTF-8"), :remove_contents => ['script'])
126 129 end
127 130 end
128 131  
... ... @@ -155,6 +158,25 @@
155 158 WikipediaPage.get_url(title[0]) unless title.nil? || title.empty?
156 159 end
157 160  
  161 + def self.extract_anchors(url)
  162 + self.get_content(url).extract_xmltags_values('p').join(' ').scan(/<a href="(.+?)" title=.*?>(.+?)<\/a>/).delete_if { |a| a[0] =~ /^\/wiki\/.*$/.negated }
  163 + end
  164 + end
  165 +
  166 + class FreebasePage < WebDocument
  167 + require 'net/http'
  168 + require 'kconv'
  169 + require 'json'
  170 +
  171 + def self.search_article_ids query,limit
  172 + raise ArgumentError, "Bad encoding", name unless name.isutf8
  173 +
  174 + JSON.parse(Net::HTTP.get( URI.parse "http://api.freebase.com/api/service/search?query=#{query.gsub(" ","+")}&limit=#{limit}" ))['result'].collect { |a| a['article']['id'] unless a['article'].nil? }.compact
  175 + end
  176 +
  177 + def self.get_url id
  178 + "http://api.freebase.com/api/trans/raw#{id}"
  179 + end
158 180 end
159 181 end
lib/mirimiri/index.rb
  1 +#!/usr/bin/env ruby
  2 +
  3 +#--
  4 +# This file is a part of the mirimiri library
  5 +#
  6 +# Copyright (C) 2010-2012 Romain Deveaud <romain.deveaud@gmail.com>
  7 +#
  8 +# This program is free software: you can redistribute it and/or modify
  9 +# it under the terms of the GNU General Public License as published by
  10 +# the Free Software Foundation, either version 3 of the License, or
  11 +# (at your option) any later version.
  12 +#
  13 +# This program is distributed in the hope that it will be useful,
  14 +# but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  16 +# GNU General Public License for more details.
  17 +#
  18 +# You should have received a copy of the GNU General Public License
  19 +# along with this program. If not, see <http://www.gnu.org/licenses/>.
  20 +#++
  21 +
  22 +class Index
  23 +end
  24 +
  25 +module Indri
  26 +
  27 + class IndriIndex
  28 +
  29 + def exec indriquery
  30 + raise ArgumentError, 'Argument is not an IndriQuery' unless indriquery.is_a? Indri::IndriQuery
  31 +
  32 + query = "IndriRunQuery -query#{indriquery.query} -index=#{@path}"
  33 +
  34 + query += " -count=#{indriquery.count}" unless indriquery.count.nil?
  35 + query += " -rule=method:#{indriquery.sm_method},#{indriquery.sm_param}:#{indriquery.sm_value}" unless indriquery.sm_method.nil?
  36 + query += " #{indriquery.args}" unless indriquery.args.nil?
  37 + end
  38 + end
  39 +end
lib/mirimiri/query.rb
... ... @@ -27,10 +27,11 @@
27 27 class Parameters
28 28 attr_accessor :index_path, :memory, :count, :offset, :run_id, :print_query, :print_docs, :rule, :baseline
29 29  
30   - def initialize(corpus,count="1000",mem="1g",offset="1",run_id="default",print_query=false,print_docs=false)
  30 + def initialize(corpus,count="1000",mem="1g",threads="1",offset="1",run_id="default",print_query=false,print_docs=false)
31 31 @index_path = corpus
32 32 @memory = mem
33 33 @count = count
  34 + @threads = threads
34 35 @offset = offset
35 36 @run_id = run_id
36 37 @print_query = print_query ? "true" : "false"
... ... @@ -41,6 +42,7 @@
41 42 h = "<memory>#{@memory}</memory>\n"
42 43 h += "<index>#{@index_path}</index>\n"
43 44 h += "<count>#{@count}</count>\n"
  45 + h += "<threads>#{@threads}</threads>\n"
44 46 unless @baseline.nil?
45 47 h += "<baseline>#{@baseline}</baseline>\n"
46 48 else
... ... @@ -56,7 +58,7 @@
56 58 end
57 59 end
58 60  
59   - class IndriQuery < Query
  61 + class IndriQueryOld < Query
60 62 attr_accessor :id, :query, :rule
61 63  
62 64 def initialize(id,query)
... ... @@ -75,6 +77,20 @@
75 77  
76 78 def exec params
77 79 `IndriRunQuery -query='#{@query}' -index=#{params.index_path} -count=#{params.count} -rule=method:dirichlet,mu:2500 -trecFormat`
  80 + end
  81 + end
  82 +
  83 + class IndriQuery < Query
  84 + attr_accessor :query, :count, :sm_method, :sm_param, :sm_value, :args
  85 +
  86 + def initialize atts={},args=nil
  87 + raise ArgumentError, 'Argument 1 must be a Hash' unless args.is_a? Hash
  88 + atts.each do |k,v|
  89 + instance_variable_set("@#{k}", v) unless v.nil?
  90 + end
  91 +
  92 + raise ArgumentError, 'Argument 2 must be a String' unless args.is_a? String
  93 + @args = args
78 94 end
79 95 end
80 96  
lib/mirimiri/string.rb
... ... @@ -67,7 +67,7 @@
67 67 "whoever","whole","whom","whomever","whomsoever","whose","whosoever","why","will",
68 68 "wilt","with","within","without","worse","worst","would","wow","ye","yet","year",
69 69 "yippee","you","your","yours","yourself","yourselves",
70   - "edit", "new", "page", "article", "http", "www", "com", "org", "wikipedia", "en"
  70 + "edit", "new", "page", "article", "http", "www", "com", "org", "wikipedia", "en","html"
71 71 ]
72 72  
73 73 Transmap = {