Deveaud Romain / mirimiri

1

#!/usr/bin/env ruby

1

#!/usr/bin/env ruby

2

3

#--

3

#--

4

# This file is a part of the mirimiri library

4

# This file is a part of the mirimiri library

5

#

5

#

6

7

#

7

#

8

# This program is free software: you can redistribute it and/or modify

8

# This program is free software: you can redistribute it and/or modify

9

# it under the terms of the GNU General Public License as published by

9

# it under the terms of the GNU General Public License as published by

10

# the Free Software Foundation, either version 3 of the License, or

10

# the Free Software Foundation, either version 3 of the License, or

11

# (at your option) any later version.

11

# (at your option) any later version.

12

#

12

#

13

# This program is distributed in the hope that it will be useful,

13

# This program is distributed in the hope that it will be useful,

14

# but WITHOUT ANY WARRANTY; without even the implied warranty of

14

# but WITHOUT ANY WARRANTY; without even the implied warranty of

15

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

15

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the

16

# GNU General Public License for more details.

16

# GNU General Public License for more details.

17

#

17

#

18

# You should have received a copy of the GNU General Public License

18

# You should have received a copy of the GNU General Public License

19

# along with this program. If not, see <http://www.gnu.org/licenses/>.

19

# along with this program. If not, see <http://www.gnu.org/licenses/>.

20

#++

20

#++

21

22

23

# General module

23

# General module

24

module Mirimiri

24

module Mirimiri

25

26

# A Document is a bag of words and is constructed from a string.
#
# The raw string is kept in +doc_content+; +words+ holds the lowercased
# tokens extracted from it, in their order of appearance.
class Document
  attr_reader :words, :doc_content

  # Builds a Document from +content+ and pre-computes its tokens and the
  # per-token counts used by +entropy+ and +tf+.
  def initialize(content="")
    @doc_content = content
    @words = format_words
    @count_words = count_words
  end

  # Returns an Array containing the distinct +n+-grams (words) from the
  # current Document, in order of first appearance.
  #
  #   ngrams(2) #=> ["the free", "free encyclopedia", "encyclopedia var", "var skin", ...]
  #
  # An empty Array is returned when +n+ is smaller than 1 or larger than the
  # number of words.
  def ngrams(n)
    # each_cons raises on sizes below 1, whereas no window can ever match them.
    return [] if n < 1

    @words.each_cons(n).map { |gram| gram.join(" ") }.uniq
  end

  # Computes the entropy of a given string +s+ inside the document.
  #
  # If the string parameter is composed of many words (i.e. tokens separated
  # by whitespace(s)), it is considered as an ngram and the contributions of
  # its words are summed.
  #
  #   entropy("guitar")                #=> 0.00432114812727959
  #   entropy("dillinger escape plan") #=> 0.265862076325102
  #
  # Words that do not occur in the document contribute nothing, and the
  # entropy inside an empty document is 0.0.
  def entropy(s)
    total = @words.size.to_f
    # Without this guard every probability below would be 0/0, i.e. NaN.
    return 0.0 if total.zero?

    s.split.inject(0.0) do |en, w|
      p_wi = @count_words[w] / total
      # 0 * log2(0) evaluates to NaN in floating point; by convention the
      # term is worth 0, so unseen words are simply skipped.
      p_wi > 0 ? en - p_wi * Math.log2(p_wi) : en
    end
  end

  # Computes the term frequency of a given *word* +s+, i.e. its number of
  # occurrences divided by the number of words of the document.
  #
  #   tf("guitar") #=> 0.000380372765310004
  #
  # Returns 0.0 for an unseen word or an empty document.
  def tf(s)
    return 0.0 if @words.empty?

    @count_words[s].to_f / @words.size
  end

  protected

  # Splits the content on non-word characters (see
  # http://perldoc.perl.org/perlre.html and the \\W special escape), keeps the
  # tokens holding at least one ASCII letter and lowercases them.
  #
  # Protected function, only meant to be called at the initialization.
  def format_words
    @doc_content.scan(/\w+/).select { |token| token =~ /[a-zA-Z]/ }.map { |token| token.downcase }
  end

  # Returns a Hash containing the words and their associated counts in the
  # current Document. A fresh Hash is built on every call.
  #
  #   count_words #=> { "guitar"=>1, "bass"=>3, "album"=>20, ... }
  #
  # The Hash answers 0 for unknown words without storing them, so looking a
  # word up never alters the counts.
  def count_words
    @words.each_with_object(Hash.new(0)) { |w, counts| counts[w] += 1 }
  end
end

109

110

# A WebDocument is a Document with a +url+.
class WebDocument < Document
  attr_reader :url

  # Returns the HTML text from the page of a given +url+.
  def self.get_content(url)
    # Loaded lazily so that plain Documents do not pull in the HTTP stack.
    require 'net/http'
    Net::HTTP.get(URI.parse(url))
  end

  # WebDocument constructor, the content of the Document is the HTML page
  # without the tags.
  #
  # When +only_tags+ is given, only the values extracted for those tags are
  # kept (concatenated) before the scripts and the markup are stripped.
  #
  # NOTE(review): extract_xmltags_values, strip_javascripts and
  # strip_xml_tags are String extensions that are not defined in this file —
  # confirm they are loaded before this class is used.
  def initialize(url,only_tags=nil)
    @url = url

    # The page is fetched exactly once, whichever branch is taken afterwards.
    content = WebDocument.get_content(url)
    content = content.extract_xmltags_values(only_tags).join("") unless only_tags.nil?

    super content.strip_javascripts.strip_xml_tags
  end
end

128

129

# A WikipediaPage is a WebDocument.
class WikipediaPage < WebDocument
  require 'rexml/document'
  require 'net/http'
  require 'kconv'

  # Searches the English Wikipedia for +name+ through the api.php endpoint
  # and returns an Array with the +title+ attribute of every node found under
  # api/query/search, or +nil+ when the response has no such element.
  #
  # Raises ArgumentError when +name+ is not UTF-8 (as reported by +isutf8+).
  def self.search_wikipedia_titles(name)
    # The offending name is not passed to raise: its third argument is the
    # backtrace, not extra data.
    raise ArgumentError, "Bad encoding" unless name.isutf8

    # encode_www_form_component also escapes '&', '=' and '+', so +name+
    # cannot leak into the other query parameters.
    res = api_xml("http://en.wikipedia.org/w/api.php?action=query&list=search&srsearch=#{URI.encode_www_form_component name}&format=xml").elements['api/query/search']

    res.map { |e| e.attributes['title'] } unless res.nil?
  end

  # Returns the +fullurl+ attribute reported by the API for the page titled
  # +name+, or +nil+ when the page is flagged as +missing+ or when the
  # response holds no api/query/pages/page element.
  #
  # Raises ArgumentError when +name+ is not UTF-8 (as reported by +isutf8+).
  def self.get_url(name)
    raise ArgumentError, "Bad encoding" unless name.isutf8

    page = api_xml("http://en.wikipedia.org/w/api.php?action=query&titles=#{URI.encode_www_form_component name}&inprop=url&prop=info&format=xml").elements['api/query/pages/page']
    # An unexpected response has no page element to read attributes from.
    return nil if page.nil?

    atts = page.attributes
    atts['fullurl'] if atts['missing'].nil?
  end

  # Returns the url of the first search result for +name+, or +nil+ when the
  # search gives nothing.
  def self.search_homepage(name)
    titles = WikipediaPage.search_wikipedia_titles name

    WikipediaPage.get_url(titles.first) unless titles.nil? || titles.empty?
  end

  # Fetches +url+ and parses the body as an XML document.
  #
  # NOTE(review): String#unaccent is not defined in this file — confirm the
  # extension is loaded. The endpoint is queried over plain http and
  # Net::HTTP.get does not follow redirects — confirm it still answers
  # directly.
  def self.api_xml(url)
    REXML::Document.new(Net::HTTP.get(URI.parse(url)).unaccent.toutf8)
  end
  private_class_method :api_xml
end

159

end

159

end

160

GITLAB

Deveaud Romain / mirimiri

resolving encoding problems