Commit b506940c3f0fd9a95c10034e4e6b940a9381056e
1 parent
845768f8ac
Exists in
master
Add the possibility to extract only the contents of selected HTML fields when initializing a WebDocument
Showing 1 changed file with 4 additions and 7 deletions Inline Diff
lib/mirimiri/document.rb
1 | #!/usr/bin/env ruby | 1 | #!/usr/bin/env ruby |
2 | 2 | ||
3 | #-- | 3 | #-- |
4 | # This file is a part of the mirimiri library | 4 | # This file is a part of the mirimiri library |
5 | # | 5 | # |
6 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> | 6 | # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com> |
7 | # | 7 | # |
8 | # This program is free software: you can redistribute it and/or modify | 8 | # This program is free software: you can redistribute it and/or modify |
9 | # it under the terms of the GNU General Public License as published by | 9 | # it under the terms of the GNU General Public License as published by |
10 | # the Free Software Foundation, either version 3 of the License, or | 10 | # the Free Software Foundation, either version 3 of the License, or |
11 | # (at your option) any later version. | 11 | # (at your option) any later version. |
12 | # | 12 | # |
13 | # This program is distributed in the hope that it will be useful, | 13 | # This program is distributed in the hope that it will be useful, |
14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of | 14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
16 | # GNU General Public License for more details. | 16 | # GNU General Public License for more details. |
17 | # | 17 | # |
18 | # You should have received a copy of the GNU General Public License | 18 | # You should have received a copy of the GNU General Public License |
19 | # along with this program. If not, see <http://www.gnu.org/licenses/>. | 19 | # along with this program. If not, see <http://www.gnu.org/licenses/>. |
20 | #++ | 20 | #++ |
21 | 21 | ||
22 | 22 | ||
23 | # General module | 23 | # General module |
24 | module Mirimiri | 24 | module Mirimiri |
25 | 25 | ||
# A Document is a bag of words built from a plain-text string.
class Document
  attr_reader :words, :doc_content

  # Tokenizes +@doc_content+: splits on whitespace, then on any non-word
  # character (\W), keeps only tokens containing at least one ASCII letter,
  # and downcases them.
  #
  # Protected helper, only meant to be called at initialization.
  #
  # @return [Array<String>] the normalized word list
  def format_words
    @doc_content.split.flat_map do |token|
      token.split(/\W/).select { |sub| sub =~ /[a-zA-Z]/ }.map(&:downcase)
    end
  end

  # Returns an Array containing the unique +n+-grams (of words) from the
  # current Document.
  #
  #   ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", ...]
  #
  # @param n [Integer] size of the sliding window
  # @return [Array<String>]
  def ngrams(n)
    # each_cons yields every consecutive window of n words (a sliding window).
    @words.each_cons(n).map { |window| window.join(" ") }.uniq
  end

  # Returns a Hash mapping each word to its occurrence count in the Document.
  #
  #   count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }
  #
  # NOTE: the returned Hash has a default value of 0, so looking up a missing
  # word returns 0 *without* inserting the key (the previous block-form
  # default mutated the hash on mere lookup).
  #
  # @return [Hash{String=>Integer}]
  def count_words
    @words.each_with_object(Hash.new(0)) { |w, counts| counts[w] += 1 }
  end

  # Computes the entropy of a given string +s+ inside the document.
  #
  # If the string parameter is composed of many words (i.e. tokens separated
  # by whitespace(s)), it is considered as an ngram.
  #
  #   entropy("guitar")                #=> 0.00432114812727959
  #   entropy("dillinger escape plan") #=> 0.265862076325102
  #
  # Words absent from the document contribute 0 (by the usual 0*log(0) = 0
  # convention); previously they produced NaN via 0 * log2(0).
  #
  # @param s [String] a word or whitespace-separated ngram
  # @return [Float] the (non-negative) entropy
  def entropy(s)
    counts = self.count_words
    total = @words.count.to_f

    en = s.split.reduce(0.0) do |acc, w|
      p_wi = counts[w].to_f / total
      next acc if p_wi.zero? # skip absent words: lim p->0 of p*log2(p) is 0
      acc + p_wi * Math.log2(p_wi)
    end

    -en
  end

  # Computes the term frequency of a given *word* +s+.
  #
  #   tf("guitar") #=> 0.000380372765310004
  #
  # Returns 0.0 for an empty document (avoids 0/0 => NaN).
  #
  # @param s [String] a single word
  # @return [Float]
  def tf(s)
    return 0.0 if @words.empty?
    self.count_words[s].to_f / @words.size.to_f
  end

  # @param content [String] raw text the bag of words is built from
  def initialize(content="")
    @doc_content = content
    @words = format_words
  end

  protected :format_words
end
109 | 110 | ||
# A WebDocument is a Document with a +url+; its content is fetched over HTTP.
class WebDocument < Document
  attr_reader :url

  # Fetches and returns the raw HTML body of the page at +url+.
  #
  # @param url [String]
  # @return [String] the HTTP response body
  def self.get_content(url)
    require 'net/http'
    Net::HTTP.get(URI.parse(url))
  end

  # WebDocument constructor. The Document content is the fetched page with
  # scripts, stylesheets and markup stripped. When +only_tags+ is given,
  # only the text found inside those XML/HTML tags is kept.
  #
  # @param url [String] page to fetch
  # @param only_tags restrict extraction to these tags (nil = whole page)
  def initialize(url, only_tags=nil)
    @url = url
    page = WebDocument.get_content(url)
    page = page.extract_xmltags_values(only_tags).join("") unless only_tags.nil?
    super page.strip_javascripts.strip_stylesheets.strip_xml_tags
  end
end
127 | 129 | ||
# A WikipediaPage is a WebDocument backed by the English Wikipedia API.
class WikipediaPage < WebDocument
  require 'rexml/document'
  require 'net/http'
  require 'kconv'

  # Searches Wikipedia for pages matching +name+.
  #
  # NOTE(review): URI.escape is deprecated (removed in Ruby 3); consider
  # URI.encode_www_form_component when the supported Ruby version allows it.
  #
  # @param name [String] UTF-8 search query
  # @return [Array<String>, nil] matching page titles, or nil when the API
  #   response has no search section
  # @raise [ArgumentError] if +name+ is not valid UTF-8
  def self.search_wikipedia_titles(name)
    # BUG FIX: the original `raise ArgumentError, "Bad encoding", name` passed
    # +name+ as raise's third argument (the *backtrace*); put it in the message.
    raise ArgumentError, "Bad encoding: #{name}" unless name.isutf8

    res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).toutf8).elements['api/query/search']

    res.collect { |e| e.attributes['title'] } unless res.nil?
  end

  # Resolves the full URL of the Wikipedia page titled +name+.
  #
  # @param name [String] UTF-8 page title
  # @return [String, nil] the page's full URL, or nil when the page is missing
  # @raise [ArgumentError] if +name+ is not valid UTF-8
  def self.get_url(name)
    raise ArgumentError, "Bad encoding: #{name}" unless name.isutf8

    atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).toutf8).elements['api/query/pages/page'].attributes

    atts['fullurl'] if atts['missing'].nil?
  end

  # Builds a WikipediaPage from the best search match for +name+.
  #
  # @param name [String] UTF-8 search query
  # @return [WikipediaPage, nil] nil when the search yields no title
  def self.search_homepage(name)
    title = WikipediaPage.search_wikipedia_titles name

    WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty?
  end
end