Deveaud Romain / mirimiri

1

#!/usr/bin/env ruby

1

#!/usr/bin/env ruby

2

3

#--

3

#--

4

# This file is a part of the mirimiri library

4

# This file is a part of the mirimiri library

5

#

5

#

6

7

#

7

#

8

# This program is free software: you can redistribute it and/or modify

8

# This program is free software: you can redistribute it and/or modify

9

# it under the terms of the GNU General Public License as published by

9

# it under the terms of the GNU General Public License as published by

10

# the Free Software Foundation, either version 3 of the License, or

10

# the Free Software Foundation, either version 3 of the License, or

11

# (at your option) any later version.

11

# (at your option) any later version.

12

#

12

#

13

# This program is distributed in the hope that it will be useful,

13

# This program is distributed in the hope that it will be useful,

14

# but WITHOUT ANY WARRANTY; without even the implied warranty of

14

# but WITHOUT ANY WARRANTY; without even the implied warranty of

15

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

15

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

16

# GNU General Public License for more details.

16

# GNU General Public License for more details.

17

#

17

#

18

# You should have received a copy of the GNU General Public License

18

# You should have received a copy of the GNU General Public License

19

# along with this program. If not, see <http://www.gnu.org/licenses/>.

19

# along with this program. If not, see <http://www.gnu.org/licenses/>.

20

#++

20

#++

21

22

23

# General module

23

# General module

24

module Mirimiri

24

module Mirimiri

25

26

# A Document is a bag of words and is constructed from a string.

26

# A Document is a bag of words and is constructed from a string.

27

class Document

27

class Document

28

attr_reader :words, :doc_content

28

attr_reader :words, :doc_content

29

30

# Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html

30

# Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html

31

# and the \\W special escape).

31

# and the \\W special escape).

32

#

32

#

33

# Protected function, only meant to be called at initialization.

33

# Protected function, only meant to be called at initialization.

34

def format_words

34

def format_words

35

wo = []

35

wo = []

36

37

@doc_content.split.each do |w|

37

@doc_content.split.each do |w|

38

w.split(/\W/).each do |sw|

38

w.split(/\W/).each do |sw|

39

wo.push(sw.downcase) if sw =~ /[a-zA-Z]/

39

wo.push(sw.downcase) if sw =~ /[a-zA-Z]/

40

end

40

end

41

end

41

end

42

43

wo

43

wo

44

end

44

end

45

46

# Returns an Array containing the +n+-grams (words) from the current Document.

46

# Returns an Array containing the +n+-grams (words) from the current Document.

47

#

47

#

48

# ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]

48

# ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]

49

def ngrams(n)

49

def ngrams(n)

50

window = []

50

window = []

51

ngrams_array = []

51

ngrams_array = []

52

53

@words.each do |w|

53

@words.each do |w|

54

window.push(w)

54

window.push(w)

55

if window.size == n

55

if window.size == n

56

ngrams_array.push window.join(" ")

56

ngrams_array.push window.join(" ")

57

window.delete_at(0)

57

window.delete_at(0)

58

end

58

end

59

end

59

end

60

61

ngrams_array.uniq

61

ngrams_array.uniq

62

end

62

end

63

64

# Returns a Hash containing the words and their associated counts in the current Document.

64

# Returns a Hash containing the words and their associated counts in the current Document.

65

#

65

#

66

# count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }

66

# count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }

67

def count_words

67

def count_words

68

counts = Hash.new { |h,k| h[k] = 0 }

68

counts = Hash.new { |h,k| h[k] = 0 }

69

@words.each { |w| counts[w] += 1 }

69

@words.each { |w| counts[w] += 1 }

70

71

counts

71

counts

72

end

72

end

73

74

# Computes the entropy of a given string +s+ inside the document.

74

# Computes the entropy of a given string +s+ inside the document.

75

#

75

#

76

# If the string parameter is composed of many words (i.e. tokens separated

76

# If the string parameter is composed of many words (i.e. tokens separated

77

# by whitespace(s)), it is considered as an ngram.

77

# by whitespace(s)), it is considered as an ngram.

78

#

78

#

79

# entropy("guitar") #=> 0.00432114812727959

79

# entropy("guitar") #=> 0.00432114812727959

80

# entropy("dillinger escape plan") #=> 0.265862076325102

80

# entropy("dillinger escape plan") #=> 0.265862076325102

81

def entropy(s)

81

def entropy(s)

82

en = 0.0

82

en = 0.0

83

# TODO: count_words as an attribute?

84

counts = self.count_words

85

83

86

s.split.each do |w|

84

s.split.each do |w|

87

p_wi = counts[w].to_f/@words.count.to_f

85

p_wi = @count_words[w].to_f/@words.count.to_f

88

en += p_wi*Math.log2(p_wi)

86

en += p_wi*Math.log2(p_wi)

89

end

87

end

90

88

91

en *= -1

89

en *= -1

92

en

90

en

93

end

91

end

94

92

95

# Computes the term frequency of a given *word* +s+.

93

# Computes the term frequency of a given *word* +s+.

96

#

94

#

97

# tf("guitar") #=> 0.000380372765310004

95

# tf("guitar") #=> 0.000380372765310004

98

def tf(s)

96

def tf(s)

99

self.count_words[s].to_f/@words.size.to_f

97

@count_words[s].to_f/@words.size.to_f

100

end

98

end

101

99

102

100

103

def initialize(content="")

101

def initialize(content="")

104

@doc_content = content

102

@doc_content = content

105

@words = format_words

103

@words = format_words

104

@count_words = count_words

106

end

105

end

107

106

108

protected :format_words

107

protected :format_words, :count_words

109

end

108

end

110

109

111

# A WebDocument is a Document with a +url+.

110

# A WebDocument is a Document with a +url+.

112

class WebDocument < Document

111

class WebDocument < Document

113

attr_reader :url

112

attr_reader :url

114

113

115

# Returns the HTML text from the page of a given +url+.

114

# Returns the HTML text from the page of a given +url+.

116

def self.get_content(url)

115

def self.get_content(url)

117

require 'net/http'

116

require 'net/http'

118

Net::HTTP.get(URI.parse(url))

117

Net::HTTP.get(URI.parse(url))

119

end

118

end

120

119

121

# WebDocument constructor, the content of the Document is the HTML page

120

# WebDocument constructor, the content of the Document is the HTML page

122

# without the tags.

121

# without the tags.

123

def initialize(url,only_tags=nil)

122

def initialize(url,only_tags=nil)

124

@url = url

123

@url = url

125

content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("")

124

content = only_tags.nil? ? WebDocument.get_content(url) : WebDocument.get_content(url).extract_xmltags_values(only_tags).join("")

126

super content.strip_javascripts.strip_stylesheets.strip_xml_tags

125

super content.strip_javascripts.strip_stylesheets.strip_xml_tags

127

end

126

end

128

end

127

end

129

128

130

# A WikipediaPage is a WebDocument.

129

# A WikipediaPage is a WebDocument.

131

class WikipediaPage < WebDocument

130

class WikipediaPage < WebDocument

132

require 'rexml/document'

131

require 'rexml/document'

133

require 'net/http'

132

require 'net/http'

134

require 'kconv'

133

require 'kconv'

135

134

136

135

137

def self.search_wikipedia_titles(name)

136

def self.search_wikipedia_titles(name)

138

raise ArgumentError, "Bad encoding", name unless name.isutf8

137

raise ArgumentError, "Bad encoding", name unless name.isutf8

139

138

140

res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).toutf8).elements['api/query/search']

139

res = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.escape name}&format=xml" ).toutf8).elements['api/query/search']

141

140

142

res.collect { |e| e.attributes['title'] } unless res.nil?

141

res.collect { |e| e.attributes['title'] } unless res.nil?

143

end

142

end

144

143

145

def self.get_url(name)

144

def self.get_url(name)

146

raise ArgumentError, "Bad encoding", name unless name.isutf8

145

raise ArgumentError, "Bad encoding", name unless name.isutf8

147

146

148

atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).toutf8).elements['api/query/pages/page'].attributes

147

atts = REXML::Document.new(Net::HTTP.get( URI.parse "http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.escape name}&inprop=url&prop=info&format=xml" ).toutf8).elements['api/query/pages/page'].attributes

149

148

150

atts['fullurl'] if atts['missing'].nil?

149

atts['fullurl'] if atts['missing'].nil?

151

end

150

end

152

151

153

def self.search_homepage(name)

152

def self.search_homepage(name)

154

title = WikipediaPage.search_wikipedia_titles name

153

title = WikipediaPage.search_wikipedia_titles name

155

154

156

WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty?

155

WikipediaPage.new(WikipediaPage.get_url title[0]) unless title.nil? || title.empty?

157

end

156

end

158

157

159

end

158

end

160

end

159

end

GITLAB

Deveaud Romain / mirimiri

calls to the count_words method of Document are no longer allowed

 #!/usr/bin/env ruby
 #--
 # This file is a part of the mirimiri library
 #
 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #++
 # General module
 module Mirimiri
   # A Document is a bag of words and is constructed from a string.
   class Document
     attr_reader :words, :doc_content
     # Any non-word characters are removed from the words (see http://perldoc.perl.org/perlre.html
     # and the \\W special escape).
     #
     # Protected function, only meant to be called at initialization.
     def format_words
       # Break the raw content on whitespace, then break every chunk again on
       # non-word characters. Only fragments holding at least one ASCII letter
       # are kept, and they are returned lowercased in their original order.
       @doc_content.split.flat_map do |chunk|
         chunk.split(/\W/).select { |piece| piece =~ /[a-zA-Z]/ }.map(&:downcase)
       end
     end
     # Returns an Array containing the +n+-grams (words) from the current Document.
     #
     #   ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]
     def ngrams(n)
       # +buffer+ is a sliding window over the words. Each time it holds
       # exactly +n+ words, they are joined with a single space and recorded,
       # and the oldest word is dropped so the window advances by one.
       buffer = []
       grams = @words.each_with_object([]) do |word, collected|
         buffer << word
         next unless buffer.size == n

         collected << buffer.join(" ")
         buffer.shift
       end

       # Duplicates are removed; the first occurrence keeps its position.
       grams.uniq
     end
     # Returns a Hash containing the words and their associated counts in the current Document.
     #
     #   count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }
     def count_words
       # The hash is created with a default block that stores 0 for any key
       # looked up for the first time, so reading an unknown word from the
       # returned hash yields 0 (and inserts that key).
       @words.each_with_object(Hash.new { |hash, key| hash[key] = 0 }) do |word, tally|
         tally[word] += 1
       end
     end
     # Computes the entropy of a given string +s+ inside the document.
     #
     # If the string parameter is composed of many words (i.e. tokens separated
     # by whitespace(s)), it is considered as an ngram.
     #
     #   entropy("guitar") #=> 0.00432114812727959
     #   entropy("dillinger escape plan") #=> 0.265862076325102
     def entropy(s)
       # Running sum of p * log2(p) over the whitespace-separated tokens of +s+;
       # the sum is negated just before it is returned.
       en = 0.0
-      # TODO: count_words as an attribute?
-      counts = self.count_words
       s.split.each do |w|
         # p_wi: occurrences of the token divided by the total number of words.
-        p_wi = counts[w].to_f/@words.count.to_f
+        p_wi = @count_words[w].to_f/@words.count.to_f
         # NOTE(review): for a token that never occurs in the document p_wi is
         # 0.0, and 0.0 * Math.log2(0.0) evaluates to NaN, which makes the whole
         # result NaN. Confirm whether absent tokens should contribute 0 instead.
         en += p_wi*Math.log2(p_wi)
       end
       en *= -1
       en
     end
     # Computes the term frequency of a given *word* +s+.
     #
     #   tf("guitar") #=> 0.000380372765310004
     def tf(s)
       # Occurrences of +s+ divided by the total number of words, as a Float.
       # NOTE(review): with an empty word list this is 0.0/0.0, i.e. NaN.
-      self.count_words[s].to_f/@words.size.to_f
+      @count_words[s].to_f/@words.size.to_f
     end
     def initialize(content="")
       # Keep the raw text, then derive the word list from it. The order
       # matters: format_words reads @doc_content.
       @doc_content = content
       @words = format_words
       # Word counts are computed once here from @words and kept in an instance
       # variable so entropy and tf can read them without recounting.
+      @count_words = count_words
     end
-    protected :format_words
+    protected :format_words, :count_words
   end
   # A WebDocument is a Document with a +url+.
   class WebDocument < Document
     attr_reader :url

     # Fetches the resource located at +url+ (a String) with a plain GET and
     # returns the response body.
     def self.get_content(url)
       require 'net/http'
       location = URI.parse(url)
       Net::HTTP.get(location)
     end

     # Builds the document from the page found at +url+.
     #
     # When +only_tags+ is given, the page is first reduced to the joined
     # values returned by extract_xmltags_values(only_tags). Scripts,
     # stylesheets and XML tags are then stripped before the text is handed to
     # the Document constructor.
     # NOTE(review): extract_xmltags_values and the strip_* methods are
     # presumably String extensions defined elsewhere in the library.
     def initialize(url,only_tags=nil)
       @url = url
       page = WebDocument.get_content(url)
       page = page.extract_xmltags_values(only_tags).join("") unless only_tags.nil?
       super(page.strip_javascripts.strip_stylesheets.strip_xml_tags)
     end
   end
   # A WikipediaPage is a WebDocument.
   class WikipediaPage < WebDocument
     require 'rexml/document'
     require 'net/http'
     require 'kconv'

     # Searches the English Wikipedia for +name+ and returns the titles of the
     # matching pages as an Array, or nil when the response holds no
     # api/query/search element.
     #
     # Raises ArgumentError when +name+ is not valid UTF-8.
     def self.search_wikipedia_titles(name)
       # The offending value goes into the message. Passing it as a third
       # argument to raise would set it as the exception's backtrace.
       raise ArgumentError, "Bad encoding: #{name.inspect}" unless name.isutf8

       # encode_www_form_component also escapes '&', '=' and '+', so a name
       # containing them cannot break the query string. URI.escape did not,
       # and it was removed in Ruby 3.0.
       res = query_api("action=query&list=search&srsearch=#{URI.encode_www_form_component name}&format=xml").elements['api/query/search']

       res.collect { |e| e.attributes['title'] } unless res.nil?
     end

     # Returns the full URL of the Wikipedia page titled +name+, or nil when
     # the API reports the page as missing or returns no page element.
     #
     # Raises ArgumentError when +name+ is not valid UTF-8.
     def self.get_url(name)
       raise ArgumentError, "Bad encoding: #{name.inspect}" unless name.isutf8

       page = query_api("action=query&titles=#{URI.encode_www_form_component name}&inprop=url&prop=info&format=xml").elements['api/query/pages/page']
       # Without this guard a response lacking the page element would raise
       # NoMethodError on nil.
       return nil if page.nil?

       atts = page.attributes
       atts['fullurl'] if atts['missing'].nil?
     end

     # Builds a WikipediaPage from the first search result for +name+.
     # Returns nil when the search yields nothing or the page URL cannot be
     # resolved.
     def self.search_homepage(name)
       titles = WikipediaPage.search_wikipedia_titles name
       return nil if titles.nil? || titles.empty?

       url = WikipediaPage.get_url titles[0]
       # get_url returns nil for a missing page; do not hand nil to the
       # constructor, which would try to parse it as a URL.
       WikipediaPage.new(url) unless url.nil?
     end

     # Sends +query+ (an already-encoded query string) to the Wikipedia API and
     # returns the response parsed as a REXML::Document.
     def self.query_api(query)
       body = Net::HTTP.get(URI.parse("http://en.wikipedia.org/w/api.php?#{query}"))
       REXML::Document.new(body.toutf8)
     end
     private_class_method :query_api
   end
 end