Deveaud Romain / mirimiri

#!/usr/bin/env ruby

#--
# This file is a part of the mirimiri library
#
# Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.
#++


module Mirimiri

  # These are the default stopwords provided by Lemur, followed by a few
  # web/Wikipedia specific terms ("edit", "http", "wikipedia", ...).
  Stoplist = [
    "a","about","above","according","across","after","afterwards","again","against",
    "albeit","all","almost","alone","along","already","also","although","always","am",
    "among","amongst","an","and","another","any","anybody","anyhow","anyone","anything",
    "anyway","anywhere","apart","are","around","as","at","av","be","became","because",
    "become","becomes","becoming","been","before","beforehand","behind","being","below",
    "beside","besides","between","beyond","both","but","by","can","cannot","canst",
    "certain","cf","choose","contrariwise","cos","could","cu","day","do","does","doesn't",
    "doing","dost","doth","double","down","dual","during","each","either","else",
    "elsewhere","enough","et","etc","even","ever","every","everybody","everyone",
    "everything","everywhere","except","excepted","excepting","exception","exclude",
    "excluding","exclusive","far","farther","farthest","few","ff","first","for",
    "formerly","forth","forward","from","front","further","furthermore","furthest","get",
    "go","had","halves","hardly","has","hast","hath","have","he","hence","henceforth",
    "her","here","hereabouts","hereafter","hereby","herein","hereto","hereupon","hers",
    "herself","him","himself","hindmost","his","hither","hitherto","how","however",
    "howsoever","i","ie","if","in","inasmuch","inc","include","included","including",
    "indeed","indoors","inside","insomuch","instead","into","inward","inwards","is",
    "it","its","itself","just","kind","kg","km","last","latter","latterly","less","lest",
    "let","like","little","ltd","many","may","maybe","me","meantime","meanwhile","might",
    "moreover","most","mostly","more","mr","mrs","ms","much","must","my","myself",
    "namely","need","neither","never","nevertheless","next","no","nobody","none",
    "nonetheless","noone","nope","nor","not","nothing","notwithstanding","now","nowadays",
    "nowhere","of","off","often","ok","on","once","one","only","onto","or","other",
    "others","otherwise","ought","our","ours","ourselves","out","outside","over","own",
    "per","perhaps","plenty","provide","quite","rather","really","round","said","sake",
    "same","sang","save","saw","see","seeing","seem","seemed","seeming","seems","seen",
    "seldom","selves","sent","several","shalt","she","should","shown","sideways","since",
    "slept","slew","slung","slunk","smote","so","some","somebody","somehow","someone",
    "something","sometime","sometimes","somewhat","somewhere","spake","spat","spoke",
    "spoken","sprang","sprung","stave","staves","still","such","supposing","than","that",
    "the","thee","their","them","themselves","then","thence","thenceforth","there",
    "thereabout","thereabouts","thereafter","thereby","therefore","therein","thereof",
    "thereon","thereto","thereupon","these","they","this","those","thou","though",
    "thrice","through","throughout","thru","thus","thy","thyself","till","to","together",
    "too","toward","towards","ugh","unable","under","underneath","unless","unlike",
    "until","up","upon","upward","upwards","us","use","used","using","very","via","vs",
    "want","was","we","week","well","were","what","whatever","whatsoever","when","whence",
    "whenever","whensoever","where","whereabouts","whereafter","whereas","whereat",
    "whereby","wherefore","wherefrom","wherein","whereinto","whereof","whereon",
    "wheresoever","whereto","whereunto","whereupon","wherever","wherewith","whether",
    "whew","which","whichever","whichsoever","while","whilst","whither","who","whoa",
    "whoever","whole","whom","whomever","whomsoever","whose","whosoever","why","will",
    "wilt","with","within","without","worse","worst","would","wow","ye","yet","year",
    "yippee","you","your","yours","yourself","yourselves",
    "edit", "new", "page", "article", "http", "www", "com", "org", "wikipedia", "en"
  ]

  # Maps the UTF-8 byte sequence of an accented or typographic character to
  # its plain ASCII replacement. Used by String#unaccent.
  Transmap = {
    "\xC3\x80" => "A", "\xC3\x81" => "A", "\xC3\x82" => "A", "\xC3\x83" => "A",
    "\xC3\x84" => "A", "\xC3\x85" => "A", "\xC3\x86" => "AE","\xC3\x87" => "C",
    "\xC3\x88" => "E", "\xC3\x89" => "E", "\xC3\x8A" => "E", "\xC3\x8B" => "E",
    "\xC3\x8C" => "I", "\xC3\x8D" => "I", "\xC3\x8E" => "I", "\xC3\x8F" => "I",
    "\xC3\x90" => "D", "\xC3\x91" => "N", "\xC3\x92" => "O", "\xC3\x93" => "O",
    "\xC3\x94" => "O", "\xC3\x95" => "O", "\xC3\x96" => "O", "\xC3\x98" => "O",
    "\xC3\x99" => "U", "\xC3\x9A" => "U", "\xC3\x9B" => "U", "\xC3\x9C" => "U",
    "\xC3\x9D" => "Y", "\xC3\x9E" => "P", "\xC3\x9F" => "ss",
    "\xC3\xA0" => "a", "\xC3\xA1" => "a", "\xC3\xA2" => "a", "\xC3\xA3" => "a",
    "\xC3\xA4" => "a", "\xC3\xA5" => "a", "\xC3\xA6" => "ae","\xC3\xA7" => "c",
    "\xC3\xA8" => "e", "\xC3\xA9" => "e", "\xC3\xAA" => "e", "\xC3\xAB" => "e",
    "\xC3\xAC" => "i", "\xC3\xAD" => "i", "\xC3\xAE" => "i", "\xC3\xAF" => "i",
    "\xC3\xB0" => "o", "\xC3\xB1" => "n", "\xC3\xB2" => "o", "\xC3\xB3" => "o",
    "\xC3\xB4" => "o", "\xC3\xB5" => "o", "\xC3\xB6" => "o", "\xC3\xB8" => "o",
    "\xC3\xB9" => "u", "\xC3\xBA" => "u", "\xC3\xBB" => "u", "\xC3\xBC" => "u",
    "\xC3\xBD" => "y", "\xC3\xBE" => "p", "\xC3\xBF" => "y",
    "\xC4\x80" => "A", "\xC4\x81" => "a", "\xC4\x82" => "A", "\xC4\x83" => "a",
    "\xC4\x84" => "A", "\xC4\x85" => "a", "\xC4\x86" => "C", "\xC4\x87" => "c",
    "\xC4\x88" => "C", "\xC4\x89" => "c", "\xC4\x8A" => "C", "\xC4\x8B" => "c",
    "\xC4\x8C" => "C", "\xC4\x8D" => "c", "\xC4\x8E" => "D", "\xC4\x8F" => "d",
    "\xC4\x90" => "D", "\xC4\x91" => "d", "\xC4\x92" => "E", "\xC4\x93" => "e",
    "\xC4\x94" => "E", "\xC4\x95" => "e", "\xC4\x96" => "E", "\xC4\x97" => "e",
    "\xC4\x98" => "E", "\xC4\x99" => "e", "\xC4\x9A" => "E", "\xC4\x9B" => "e",
    "\xC4\x9C" => "G", "\xC4\x9D" => "g", "\xC4\x9E" => "G", "\xC4\x9F" => "g",
    "\xC4\xA0" => "G", "\xC4\xA1" => "g", "\xC4\xA2" => "G", "\xC4\xA3" => "g",
    "\xC4\xA4" => "H", "\xC4\xA5" => "h", "\xC4\xA6" => "H", "\xC4\xA7" => "h",
    "\xC4\xA8" => "I", "\xC4\xA9" => "i", "\xC4\xAA" => "I", "\xC4\xAB" => "i",
    "\xC4\xAC" => "I", "\xC4\xAD" => "i", "\xC4\xAE" => "I", "\xC4\xAF" => "i",
    "\xC4\xB0" => "I", "\xC4\xB1" => "i", "\xC4\xB2" => "IJ","\xC4\xB3" => "ij",
    "\xC4\xB4" => "J", "\xC4\xB5" => "j", "\xC4\xB6" => "K", "\xC4\xB7" => "k",
    "\xC4\xB8" => "k", "\xC4\xB9" => "L", "\xC4\xBA" => "l", "\xC4\xBB" => "L",
    "\xC4\xBC" => "l", "\xC4\xBD" => "L", "\xC4\xBE" => "l", "\xC4\xBF" => "L",
    "\xC5\x80" => "l", "\xC5\x81" => "L", "\xC5\x82" => "l", "\xC5\x83" => "N",
    "\xC5\x84" => "n", "\xC5\x85" => "N", "\xC5\x86" => "n", "\xC5\x87" => "N",
    "\xC5\x88" => "n", "\xC5\x89" => "n", "\xC5\x8A" => "N", "\xC5\x8B" => "n",
    "\xC5\x8C" => "O", "\xC5\x8D" => "o", "\xC5\x8E" => "O", "\xC5\x8F" => "o",
    # U+0152 / U+0153 are the OE ligatures (previously mistyped as "CE"/"ce").
    "\xC5\x90" => "O", "\xC5\x91" => "o", "\xC5\x92" => "OE","\xC5\x93" => "oe",
    "\xC5\x94" => "R", "\xC5\x95" => "r", "\xC5\x96" => "R", "\xC5\x97" => "r",
    "\xC5\x98" => "R", "\xC5\x99" => "r", "\xC5\x9A" => "S", "\xC5\x9B" => "s",
    "\xC5\x9C" => "S", "\xC5\x9D" => "s", "\xC5\x9E" => "S", "\xC5\x9F" => "s",
    "\xC5\xA0" => "S", "\xC5\xA1" => "s", "\xC5\xA2" => "T", "\xC5\xA3" => "t",
    "\xC5\xA4" => "T", "\xC5\xA5" => "t", "\xC5\xA6" => "T", "\xC5\xA7" => "t",
    "\xC5\xA8" => "U", "\xC5\xA9" => "u", "\xC5\xAA" => "U", "\xC5\xAB" => "u",
    "\xC5\xAC" => "U", "\xC5\xAD" => "u", "\xC5\xAE" => "U", "\xC5\xAF" => "u",
    "\xC5\xB0" => "U", "\xC5\xB1" => "u", "\xC5\xB2" => "U", "\xC5\xB3" => "u",
    "\xC5\xB4" => "W", "\xC5\xB5" => "w", "\xC5\xB6" => "Y", "\xC5\xB7" => "y",
    "\xC5\xB8" => "Y", "\xC5\xB9" => "Z", "\xC5\xBA" => "z", "\xC5\xBB" => "Z",
    "\xC5\xBC" => "z", "\xC5\xBD" => "Z", "\xC5\xBE" => "z", "\xC6\x8F" => "E",
    "\xC6\xA0" => "O", "\xC6\xA1" => "o", "\xC6\xAF" => "U", "\xC6\xB0" => "u",
    "\xC7\x8D" => "A", "\xC7\x8E" => "a", "\xC7\x8F" => "I",
    "\xC7\x90" => "i", "\xC7\x91" => "O", "\xC7\x92" => "o", "\xC7\x93" => "U",
    "\xC7\x94" => "u", "\xC7\x95" => "U", "\xC7\x96" => "u", "\xC7\x97" => "U",
    "\xC7\x98" => "u", "\xC7\x99" => "U", "\xC7\x9A" => "u", "\xC7\x9B" => "U",
    "\xC7\x9C" => "u",
    "\xC7\xBA" => "A", "\xC7\xBB" => "a", "\xC7\xBC" => "AE","\xC7\xBD" => "ae",
    "\xC7\xBE" => "O", "\xC7\xBF" => "o",
    "\xC9\x99" => "e",
    "\xC2\x82" => ",",        # High code comma
    "\xC2\x84" => ",,",       # High code double comma
    "\xC2\x85" => "...",      # Triple dot
    "\xC2\x88" => "^",        # High caret
    "\xC2\x91" => "\x27",     # Forward single quote
    "\xC2\x92" => "\x27",     # Reverse single quote
    "\xC2\x93" => "\x22",     # Forward double quote
    "\xC2\x94" => "\x22",     # Reverse double quote
    "\xC2\x96" => "-",        # High hyphen
    "\xC2\x97" => "--",       # Double hyphen
    "\xC2\xA6" => "|",        # Split vertical bar
    "\xC2\xAB" => "<<",       # Double less than
    "\xC2\xBB" => ">>",       # Double greater than
    "\xC2\xBC" => "1/4",      # one quarter
    "\xC2\xBD" => "1/2",      # one half
    "\xC2\xBE" => "3/4",      # three quarters
    "\xCA\xBF" => "\x27",     # c-single quote
    "\xCC\xA8" => "",         # modifier - under curve
    "\xCC\xB1" => "",         # modifier - under line
    # /\W/ => ""
  }

end

# CGI.unescapeHTML is used by String#strip_with_pattern.
require 'cgi'

# Extension of the standard class String with useful functions.
class String
  include Mirimiri

  # Returns a copy of +self+ in which every byte sequence listed in
  # Mirimiri::Transmap is replaced by its ASCII counterpart.
  #
  # The receiver is left untouched (it may be frozen). The comparison is done
  # byte-wise, so the returned string is tagged as binary (ASCII-8BIT);
  # byte sequences that are not in Transmap are kept as they are.
  #
  #   "caf\u00E9".unaccent #=> "cafe"
  def unaccent
    binary = Encoding::BINARY
    # Work on a binary copy: force_encoding on +self+ would silently change
    # the caller's string and fail on frozen strings.
    Transmap.inject(dup.force_encoding(binary)) do |str, (utf8, asc)|
      # The key must be binary too, otherwise a UTF-8 key cannot be matched
      # against a binary string that contains non-ASCII bytes.
      str.gsub(utf8.dup.force_encoding(binary), asc)
    end
  end

  # Returns +true+ if +self+ (downcased) belongs to Mirimiri::Stoplist,
  # +false+ otherwise.
  def is_stopword?
    Stoplist.include?(downcase)
  end

  # Do not use.
  # TODO: revamp. find why this function is here.
  #
  # For each whitespace-separated word of +self+, non-word characters are
  # turned into separators and the resulting one-character fragments are
  # dropped; what is left is joined back with single spaces (a word that
  # ends up empty still contributes its separator).
  def remove_special_characters
    split.map do |word|
      fragments = word.gsub(/\W/, ' ').split
      fragments.map { |fragment| fragment.sub(/\A.\z/, '') }.join(' ').strip
    end.join(' ')
  end

  # Removes all XML-like tags from +self+.
  #
  #   s = "<html><body>test</body></html>"
  #   s.strip_xml_tags!
  #   s                     #=> "test"
  def strip_xml_tags!
    replace(strip_with_pattern(/<\/?[^>]*>/))
  end

  # Returns a copy of +self+ without its XML-like tags.
  #
  #   s = "<html><body>test</body></html>"
  #   s.strip_xml_tags      #=> "test"
  #   s                     #=> "<html><body>test</body></html>"
  def strip_xml_tags
    dup.strip_xml_tags!
  end

  # Removes all <script> elements (tags and content) from +self+, whatever
  # their attributes are.
  #
  #   s = "<script type='text/javascript'>
  #          var skin='vector',
  #          stylepath='http://bits.wikimedia.org/skins-1.5'
  #        </script>
  #
  #        test"
  #   s.strip_javascripts!
  #   s.strip               #=> "test"
  def strip_javascripts!
    replace(strip_with_pattern(/<script\b[^>]*>.*?<\/script\s*>/mi))
  end

  # Returns a copy of +self+ without its <script> elements.
  # See String#strip_javascripts!.
  def strip_javascripts
    dup.strip_javascripts!
  end

  # Removes all <style> elements (tags and content) from +self+, whatever
  # their attributes are.
  #
  #   s = "<style type='text/css'>p { color: red }</style>test"
  #   s.strip_stylesheets!
  #   s                     #=> "test"
  def strip_stylesheets!
    replace(strip_with_pattern(/<style\b[^>]*>.*?<\/style\s*>/mi))
  end

  # Returns a copy of +self+ without its <style> elements.
  def strip_stylesheets
    dup.strip_stylesheets!
  end

  # Removes punctuation from +self+. Only ASCII letters, digits, hyphens and
  # whitespace are kept, so accented letters are removed as well.
  #
  #   s = "hello, world. how are you?!"
  #   s.strip_punctuation!
  #   s                     # => "hello world how are you"
  def strip_punctuation!
    replace(strip_with_pattern(/[^a-zA-Z0-9\-\s]/))
  end

  # Returns a copy of +self+ without punctuation.
  #
  #   s = "hello, world. how are you?!"
  #   s.strip_punctuation   # => "hello world how are you"
  def strip_punctuation
    dup.strip_punctuation!
  end

  # Returns the text values inside all occurrences of a XML tag in +self+.
  # Only the tag named exactly +tag_name+ is considered: asking for 'a' does
  # not match <abbr>. +tag_name+ is interpolated into the pattern as is.
  #
  #   s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre"
  #   s.extract_xmltags_values 'a' #=> ["Indianapolis", "Indiana"]
  def extract_xmltags_values(tag_name)
    # The opening tag is the name followed either by ">" directly or by
    # whitespace and attributes.
    scan(/<#{tag_name}(?:\s[^>]*)?>(.+?)<\/#{tag_name}>/).flatten
  end

  # Returns a new UTF-8 string built from +self+ by deleting every match of
  # +pattern+, decoding HTML entities, then replacing accented characters
  # (see String#unaccent). Bytes that are still non-ASCII after that step
  # are each replaced by a space.
  def strip_with_pattern(pattern)
    cleaned = CGI.unescapeHTML(gsub(pattern, "")).unaccent
    # Options are given as keywords: a positional Hash is not accepted as
    # encode options by recent Ruby versions.
    cleaned.encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => " ")
  end

  private :strip_with_pattern
end

261

262

GITLAB

Deveaud Romain / mirimiri

bugfix in document, more stopwords

 #!/usr/bin/env ruby
 #--
 # This file is a part of the mirimiri library
 #
 # Copyright (C) 2010-2011 Romain Deveaud <romain.deveaud@gmail.com>
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #++
 # Namespace holding the lookup tables used by the String extensions of this
 # library: the stopword list and the accent transliteration map.
 module Mirimiri
   # These are the default stopwords provided by Lemur, followed by a few
   # web/Wikipedia boilerplate terms ("edit", "http", "wikipedia", ...).
   # All entries are lowercase.
   Stoplist = [
 "a","about","above","according","across","after","afterwards","again","against",
 "albeit","all","almost","alone","along","already","also","although","always","am",
 "among","amongst","an","and","another","any","anybody","anyhow","anyone","anything",
 "anyway","anywhere","apart","are","around","as","at","av","be","became","because",
 "become","becomes","becoming","been","before","beforehand","behind","being","below",
 "beside","besides","between","beyond","both","but","by","can","cannot","canst",
 "certain","cf","choose","contrariwise","cos","could","cu","day","do","does","doesn't",
 "doing","dost","doth","double","down","dual","during","each","either","else",
 "elsewhere","enough","et","etc","even","ever","every","everybody","everyone",
 "everything","everywhere","except","excepted","excepting","exception","exclude",
 "excluding","exclusive","far","farther","farthest","few","ff","first","for",
 "formerly","forth","forward","from","front","further","furthermore","furthest","get",
 "go","had","halves","hardly","has","hast","hath","have","he","hence","henceforth",
 "her","here","hereabouts","hereafter","hereby","herein","hereto","hereupon","hers",
 "herself","him","himself","hindmost","his","hither","hitherto","how","however",
 "howsoever","i","ie","if","in","inasmuch","inc","include","included","including",
 "indeed","indoors","inside","insomuch","instead","into","inward","inwards","is",
 "it","its","itself","just","kind","kg","km","last","latter","latterly","less","lest",
 "let","like","little","ltd","many","may","maybe","me","meantime","meanwhile","might",
 "moreover","most","mostly","more","mr","mrs","ms","much","must","my","myself",
 "namely","need","neither","never","nevertheless","next","no","nobody","none",
 "nonetheless","noone","nope","nor","not","nothing","notwithstanding","now","nowadays",
 "nowhere","of","off","often","ok","on","once","one","only","onto","or","other",
 "others","otherwise","ought","our","ours","ourselves","out","outside","over","own",
 "per","perhaps","plenty","provide","quite","rather","really","round","said","sake",
 "same","sang","save","saw","see","seeing","seem","seemed","seeming","seems","seen",
 "seldom","selves","sent","several","shalt","she","should","shown","sideways","since",
 "slept","slew","slung","slunk","smote","so","some","somebody","somehow","someone",
 "something","sometime","sometimes","somewhat","somewhere","spake","spat","spoke",
 "spoken","sprang","sprung","stave","staves","still","such","supposing","than","that",
 "the","thee","their","them","themselves","then","thence","thenceforth","there",
 "thereabout","thereabouts","thereafter","thereby","therefore","therein","thereof",
 "thereon","thereto","thereupon","these","they","this","those","thou","though",
 "thrice","through","throughout","thru","thus","thy","thyself","till","to","together",
 "too","toward","towards","ugh","unable","under","underneath","unless","unlike",
 "until","up","upon","upward","upwards","us","use","used","using","very","via","vs",
 "want","was","we","week","well","were","what","whatever","whatsoever","when","whence",
 "whenever","whensoever","where","whereabouts","whereafter","whereas","whereat",
 "whereby","wherefore","wherefrom","wherein","whereinto","whereof","whereon",
 "wheresoever","whereto","whereunto","whereupon","wherever","wherewith","whether",
 "whew","which","whichever","whichsoever","while","whilst","whither","who","whoa",
 "whoever","whole","whom","whomever","whomsoever","whose","whosoever","why","will",
 "wilt","with","within","without","worse","worst","would","wow","ye","yet","year",
 "yippee","you","your","yours","yourself","yourselves",
   "edit", "new", "page", "article", "http", "www", "com", "org", "wikipedia", "en"
   ]
   # Transliteration table: each key is the UTF-8 byte sequence of an accented
   # or special character, each value is its plain ASCII replacement (possibly
   # empty, possibly several characters). String#unaccent applies every entry.
   Transmap = {
   "\xC3\x80" => "A", "\xC3\x81" => "A", "\xC3\x82" => "A", "\xC3\x83" => "A",
   "\xC3\x84" => "A", "\xC3\x85" => "A", "\xC3\x86" => "AE","\xC3\x87" => "C",
   "\xC3\x88" => "E", "\xC3\x89" => "E", "\xC3\x8A" => "E", "\xC3\x8B" => "E",
   "\xC3\x8C" => "I", "\xC3\x8D" => "I", "\xC3\x8E" => "I", "\xC3\x8F" => "I",
   "\xC3\x90" => "D", "\xC3\x91" => "N", "\xC3\x92" => "O", "\xC3\x93" => "O",
   "\xC3\x94" => "O", "\xC3\x95" => "O", "\xC3\x96" => "O", "\xC3\x98" => "O",
   "\xC3\x99" => "U", "\xC3\x9A" => "U", "\xC3\x9B" => "U", "\xC3\x9C" => "U",
   "\xC3\x9D" => "Y", "\xC3\x9E" => "P", "\xC3\x9F" => "ss",
   "\xC3\xA0" => "a", "\xC3\xA1" => "a", "\xC3\xA2" => "a", "\xC3\xA3" => "a",
   "\xC3\xA4" => "a", "\xC3\xA5" => "a", "\xC3\xA6" => "ae","\xC3\xA7" => "c",
   "\xC3\xA8" => "e", "\xC3\xA9" => "e", "\xC3\xAA" => "e", "\xC3\xAB" => "e",
   "\xC3\xAC" => "i", "\xC3\xAD" => "i", "\xC3\xAE" => "i", "\xC3\xAF" => "i",
   # NOTE(review): lowercase eth maps to "o" while uppercase eth (\xC3\x90)
   # maps to "D" -- kept as-is, confirm whether this is intended.
   "\xC3\xB0" => "o", "\xC3\xB1" => "n", "\xC3\xB2" => "o", "\xC3\xB3" => "o",
   "\xC3\xB4" => "o", "\xC3\xB5" => "o", "\xC3\xB6" => "o", "\xC3\xB8" => "o",
   "\xC3\xB9" => "u", "\xC3\xBA" => "u", "\xC3\xBB" => "u", "\xC3\xBC" => "u",
   "\xC3\xBD" => "y", "\xC3\xBE" => "p", "\xC3\xBF" => "y",
   "\xC4\x80" => "A", "\xC4\x81" => "a", "\xC4\x82" => "A", "\xC4\x83" => "a",
   "\xC4\x84" => "A", "\xC4\x85" => "a", "\xC4\x86" => "C", "\xC4\x87" => "c",
   "\xC4\x88" => "C", "\xC4\x89" => "c", "\xC4\x8A" => "C", "\xC4\x8B" => "c",
   "\xC4\x8C" => "C", "\xC4\x8D" => "c", "\xC4\x8E" => "D", "\xC4\x8F" => "d",
   "\xC4\x90" => "D", "\xC4\x91" => "d", "\xC4\x92" => "E", "\xC4\x93" => "e",
   "\xC4\x94" => "E", "\xC4\x95" => "e", "\xC4\x96" => "E", "\xC4\x97" => "e",
   "\xC4\x98" => "E", "\xC4\x99" => "e", "\xC4\x9A" => "E", "\xC4\x9B" => "e",
   "\xC4\x9C" => "G", "\xC4\x9D" => "g", "\xC4\x9E" => "G", "\xC4\x9F" => "g",
   "\xC4\xA0" => "G", "\xC4\xA1" => "g", "\xC4\xA2" => "G", "\xC4\xA3" => "g",
   "\xC4\xA4" => "H", "\xC4\xA5" => "h", "\xC4\xA6" => "H", "\xC4\xA7" => "h",
   "\xC4\xA8" => "I", "\xC4\xA9" => "i", "\xC4\xAA" => "I", "\xC4\xAB" => "i",
   "\xC4\xAC" => "I", "\xC4\xAD" => "i", "\xC4\xAE" => "I", "\xC4\xAF" => "i",
   "\xC4\xB0" => "I", "\xC4\xB1" => "i", "\xC4\xB2" => "IJ","\xC4\xB3" => "ij",
   "\xC4\xB4" => "J", "\xC4\xB5" => "j", "\xC4\xB6" => "K", "\xC4\xB7" => "k",
   "\xC4\xB8" => "k", "\xC4\xB9" => "L", "\xC4\xBA" => "l", "\xC4\xBB" => "L",
   "\xC4\xBC" => "l", "\xC4\xBD" => "L", "\xC4\xBE" => "l", "\xC4\xBF" => "L",
   "\xC5\x80" => "l", "\xC5\x81" => "L", "\xC5\x82" => "l", "\xC5\x83" => "N",
   "\xC5\x84" => "n", "\xC5\x85" => "N", "\xC5\x86" => "n", "\xC5\x87" => "N",
   "\xC5\x88" => "n", "\xC5\x89" => "n", "\xC5\x8A" => "N", "\xC5\x8B" => "n",
   "\xC5\x8C" => "O", "\xC5\x8D" => "o", "\xC5\x8E" => "O", "\xC5\x8F" => "o",
   "\xC5\x90" => "O", "\xC5\x91" => "o", "\xC5\x92" => "CE","\xC5\x93" => "ce",
   "\xC5\x94" => "R", "\xC5\x95" => "r", "\xC5\x96" => "R", "\xC5\x97" => "r",
   "\xC5\x98" => "R", "\xC5\x99" => "r", "\xC5\x9A" => "S", "\xC5\x9B" => "s",
   "\xC5\x9C" => "S", "\xC5\x9D" => "s", "\xC5\x9E" => "S", "\xC5\x9F" => "s",
   "\xC5\xA0" => "S", "\xC5\xA1" => "s", "\xC5\xA2" => "T", "\xC5\xA3" => "t",
   "\xC5\xA4" => "T", "\xC5\xA5" => "t", "\xC5\xA6" => "T", "\xC5\xA7" => "t",
   "\xC5\xA8" => "U", "\xC5\xA9" => "u", "\xC5\xAA" => "U", "\xC5\xAB" => "u",
   "\xC5\xAC" => "U", "\xC5\xAD" => "u", "\xC5\xAE" => "U", "\xC5\xAF" => "u",
   "\xC5\xB0" => "U", "\xC5\xB1" => "u", "\xC5\xB2" => "U", "\xC5\xB3" => "u",
   "\xC5\xB4" => "W", "\xC5\xB5" => "w", "\xC5\xB6" => "Y", "\xC5\xB7" => "y",
   "\xC5\xB8" => "Y", "\xC5\xB9" => "Z", "\xC5\xBA" => "z", "\xC5\xBB" => "Z",
   "\xC5\xBC" => "z", "\xC5\xBD" => "Z", "\xC5\xBE" => "z", "\xC6\x8F" => "E",
   "\xC6\xA0" => "O", "\xC6\xA1" => "o", "\xC6\xAF" => "U", "\xC6\xB0" => "u",
   "\xC7\x8D" => "A", "\xC7\x8E" => "a", "\xC7\x8F" => "I",
   "\xC7\x90" => "i", "\xC7\x91" => "O", "\xC7\x92" => "o", "\xC7\x93" => "U",
   "\xC7\x94" => "u", "\xC7\x95" => "U", "\xC7\x96" => "u", "\xC7\x97" => "U",
   "\xC7\x98" => "u", "\xC7\x99" => "U", "\xC7\x9A" => "u", "\xC7\x9B" => "U",
   "\xC7\x9C" => "u",
   "\xC7\xBA" => "A", "\xC7\xBB" => "a", "\xC7\xBC" => "AE","\xC7\xBD" => "ae",
   "\xC7\xBE" => "O", "\xC7\xBF" => "o",
   "\xC9\x99" => "e",
   "\xC2\x82" => ",",        # High code comma
   "\xC2\x84" => ",,",       # High code double comma
   "\xC2\x85" => "...",      # Triple dot
   "\xC2\x88" => "^",        # High caret
   "\xC2\x91" => "\x27",     # Forward single quote
   "\xC2\x92" => "\x27",     # Reverse single quote
   "\xC2\x93" => "\x22",     # Forward double quote
   "\xC2\x94" => "\x22",     # Reverse double quote
   "\xC2\x96" => "-",        # High hyphen
   "\xC2\x97" => "--",       # Double hyphen
   "\xC2\xA6" => "|",        # Split vertical bar
   "\xC2\xAB" => "<<",       # Double less than
   "\xC2\xBB" => ">>",       # Double greater than
   "\xC2\xBC" => "1/4",      # one quarter
   "\xC2\xBD" => "1/2",      # one half
   "\xC2\xBE" => "3/4",      # three quarters
   "\xCA\xBF" => "\x27",     # c-single quote
   "\xCC\xA8" => "",         # modifier - under curve
   "\xCC\xB1" => "",         # modifier - under line
 #  /\W/ => ""
   }
 end
 # Extension of the standard String class with text-cleaning helpers
 # (transliteration, stopword test, markup and punctuation stripping).
 class String
   include Mirimiri

   # Returns a transliterated copy of +self+: every byte sequence that is a
   # key of Mirimiri::Transmap is replaced by the mapped ASCII value.
   #
   # The receiver is left untouched. The returned string is tagged
   # ASCII-8BIT, so callers that need text usually re-encode it.
   #
   #   "caf\xC3\xA9".unaccent                #=> "cafe"
   def unaccent
     # Work on a binary copy: force_encoding on +self+ would silently retag
     # the caller's string and raise on frozen strings.
     bytes = dup.force_encoding("ASCII-8BIT")
     Transmap.inject(bytes) do |str, (utf8, asc)|
       # Matching is byte-wise; retagging the key as binary avoids an
       # Encoding::CompatibilityError when the literal was read as UTF-8.
       str.gsub(utf8.dup.force_encoding("ASCII-8BIT"), asc)
     end
   end

   # Returns +true+ if the downcased +self+ belongs to Mirimiri::Stoplist,
   # +false+ otherwise.
   def is_stopword?
     Stoplist.include?(downcase)
   end

   # Do not use.
   # TODO: revamp. find why this function is here.
   #
   # Splits +self+ on whitespace, turns non-word characters into separators,
   # blanks out one-character fragments and joins everything with spaces.
   def remove_special_characters
     split.collect do |word|
       pieces = word.gsub(/\W/, ' ').split.collect do |piece|
         piece.gsub(/\W/, ' ').strip.sub(/\A.\z/, '')
       end
       pieces.join(' ').strip.sub(/\A.\z/, '')
     end.join(' ')
   end

   # Removes all XML-like tags from +self+, in place. HTML entities in the
   # remaining text are unescaped and accented characters transliterated.
   #
   #   s = "<html><body>test</body></html>"
   #   s.strip_xml_tags!
   #   s                                     #=> "test"
   def strip_xml_tags!
     replace(strip_with_pattern(/<\/?[^>]*>/))
   end

   # Non-destructive version of #strip_xml_tags!.
   #
   #   s = "<html><body>test</body></html>"
   #   s.strip_xml_tags                      #=> "test"
   #   s                                     #=> "<html><body>test</body></html>"
   def strip_xml_tags
     dup.strip_xml_tags!
   end

   # Removes all <script> elements (tags and content) from +self+, in place.
   # Any attributes, quoting style and letter case are accepted, and empty
   # elements are removed too.
   #
   #   s = "<script type='text/javascript'>var skin='vector';</script>test"
   #   s.strip_javascripts!
   #   s                                     #=> "test"
   def strip_javascripts!
     replace(strip_with_pattern(/<script\b[^>]*>.*?<\/script\s*>/mi))
   end

   # Non-destructive version of #strip_javascripts!.
   #
   #   s = "<script type='text/javascript'>var skin='vector';</script>test"
   #   s.strip_javascripts                   #=> "test"
   def strip_javascripts
     dup.strip_javascripts!
   end

   # Removes all <style> elements (tags and content) from +self+, in place.
   # Any attributes, quoting style and letter case are accepted.
   #
   #   s = "<style type='text/css'>p { color: red }</style>test"
   #   s.strip_stylesheets!
   #   s                                     #=> "test"
   def strip_stylesheets!
     replace(strip_with_pattern(/<style\b[^>]*>.*?<\/style\s*>/mi))
   end

   # Non-destructive version of #strip_stylesheets!.
   def strip_stylesheets
     dup.strip_stylesheets!
   end

   # Removes punctuation from +self+, in place: every character that is not
   # an ASCII letter, a digit, a hyphen or whitespace is dropped. Accented
   # letters are dropped as well, since the pattern is applied before the
   # transliteration step.
   #
   #   s = "hello, world. how are you?!"
   #   s.strip_punctuation!
   #   s                                 # => "hello world how are you"
   def strip_punctuation!
     replace(strip_with_pattern(/[^a-zA-Z0-9\-\s]/))
   end

   # Non-destructive version of #strip_punctuation!.
   #
   #   s = "hello, world. how are you?!"
   #   s.strip_punctuation               # => "hello world how are you"
   def strip_punctuation
     dup.strip_punctuation!
   end

   # Returns the text values inside all occurrences of a XML tag in +self+.
   # +tag_name+ is interpolated into the regular expression as-is.
   #
   #   s = "four-piece in <a href='#'>Indianapolis</a>, <a href='#'>Indiana</a> at the Murat Theatre"
   #   s.extract_xmltags_values 'a' #=> ["Indianapolis", "Indiana"]
   def extract_xmltags_values(tag_name)
     scan(/<#{tag_name}.*?>(.+?)<\/#{tag_name}>/).flatten
   end

   # Shared pipeline of the strip_* methods: deletes every match of
   # +pattern+, unescapes HTML entities, transliterates accents and returns
   # the result as a new UTF-8 string in which bytes that cannot be
   # converted are replaced by a space.
   def strip_with_pattern(pattern)
     require 'cgi'
     stripped = CGI.unescapeHTML(gsub(pattern, ""))
     # Options are given as trailing keywords rather than a braced Hash: a
     # positional Hash is taken for the source encoding on Ruby 3 and raises.
     stripped.unaccent.encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => " ")
   end
   private :strip_with_pattern
 end