Blame view

data/rules/preprocess.regex 3.11 KB
e6be5137b   Jean-François Rey   reinitialized pro...
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
  # ======== PREPROCESS REGEX ===========
  #
  # Author:
  # ------
  # Emmanuel FERREIRA
  #
  # Warning:
  # --------
  # These regexes were first written to process
  # Bdlex-formatted data.
  #
  # How to use it
  # -------------
  # You can comment out, uncomment, or correct the rules below.
  #
  # =======================================
  # Process an already-cleaned corpus: sentence markers become periods.
  # "</s> <s>" (end + start of sentence) collapses to a single period,
  # a remaining "<s>" is dropped and a remaining "</s>" becomes a period.
  s/<\/s> <s>/\./g
  s/<s>//g
  s/<\/s>/\./g
  # Replace every [ ... ] span with a space
  s/\[[^\[]*\]/ /g
  # Replace every ( ... ) span with a space
  # (unescaped parentheses are literal characters in a basic regex)
  s/([^)]*)/ /g
  # Replace every < ... > span with a space
  s/<[^<]*>/ /g
  # Remove URLs (TODO: convert URLs to text).
  # "-" is placed last in the bracket expression so it is a literal hyphen;
  # written as "_-\." it formed a descending range, which is an invalid
  # bracket expression. "s\?" also accepts https URLs.
  s/\bhttps\?:[A-Za-z0-9\/_.=&?-]\+//g
  # Pre-format Roman numerals: XVe => XV e / Ier => I er
  # (the four rules below are currently disabled)
  #s/\bIer\b/ I er /g
  #s/\bIere\b/ I e2re /g
  #s/\bIe2re\b/ I e2re /g
  #s/\b\([IVX]\+\)e\b/ \1 e /g
  # Replace each of the punctuation characters _ * \ | - " # with a space
  s/_\|\*\|\\\||\|-\|"\|#/ /g
  # Insert a space after an apostrophe that sits between two
  # alphanumeric characters: j'etais => j' etais
  s/\([A-Za-z0-9]\)'\([A-Za-z0-9]\)/\1' \2/g
  # Rewrite two-digit times: 08:32 => 08h32
  # (runs before the ":" rule below, which would otherwise turn the colon into a period)
  s/\(\b[0-9]\{2\}\):\([0-9]\{2\}\b\)/\1h\2/g
  # Replace : with .
  s/:/\./g
  # Replace ! with .
  s/\!/\./g
  # Replace ? with . (a leading "?" is a literal character in a basic regex)
  s/?/\./g
  # Replace each run of line breaks with a single space.
  # The line break must be written as the escape \n: a raw line break inside
  # the pattern leaves the s command unterminated and the script unparsable.
  # NOTE: this only has an effect when the pattern space actually contains
  # newlines (e.g. input read with "sed -z", or lines joined with N).
  s/\n\+/ /g
  # Collapse a run of periods into a single period
  s/\.\+/\./g
  # Split numeric ordinals: 2e2me => 2 e2me and 1e2re => 1 e2re
  s/\b1e2re\b/ 1 e2re /g
  s/\b\([0-9]\+\)e2me\b/ \1 e2me /g
  # Hours: 00h00 and 00h => minuit, 12h00 and 12h => midi
  # (the "h00" forms are handled first so the trailing 00 is consumed too);
  # 01h..09h lose their leading zero and are rewritten as <digit>H
  s/\b00\(H\|h\)00\b/ minuit /g
  s/\b00\(H\|h\)/ minuit /g
  s/\b12\(H\|h\)00\b/ midi /g
  s/\b12\(H\|h\)/ midi /g
  s/\b0\([1-9]\)\(H\|h\)/ \1H/g
  # 1/2 => 1 demi
  s/\([0-9]\)\/2\b/\1 demi /g
  # 1/4 => 1 quart
  s/\([0-9]\)\/4\b/\1 quart /g
  # 1/16 => 1 16 e (single-digit numerator over any remaining denominator)
  s/\b\([0-9]\)\/\([0-9][0-9]*\)\b/ \1 \2 e /g
  # Abbreviations.
  # First put a space after every period, so that each rule below can
  # match an abbreviation as "<letters>. " and multi-part abbreviations
  # such as "J.C." become "J. C. ".
  s/\./\. /g
  s/\b[Mm]r\. / mister /g
  s/\b[Mm]me\. / madame /g
  s/\b[Mm]lle\. / mademoiselle /g
  s/\b[Ll]t\. / lieutenant /g
  s/\bM\. / monsieur /g
  s/\b[Mm]rs\. / Messieurs /g
  # The pattern consumes the space after the period, so the replacement
  # must end with a space too, or the next word is glued to it.
  s/\bMgr\. / Monseigneur /g
  s/\bDr\. / docteur /g
  s/\bPr\. / professeur /g
  s/\bPrs\. / professeurs /g
  # "apr. J.C." / "av. J.C." must be handled before the bare "J.C." rule:
  # otherwise "J. C. " is already rewritten and these two rules never match
  # (and "apr. " would then be turned into "avril" by the month rules).
  s/\bapr\?\. J\. \?C\. / apre2s Je1sus Christ /g
  s/\bav\. J\. \?C\. / avant Je1sus Christ /g
  s/\bJ\. \?C\. / Je1sus Christ /g
  s/\bSt\.\? / saint /g
  s/\bSte\.\? / sainte /g
  # =><Months> (like avr. => avr)
  s/\b[aA][vp]r\. / avril /g
  s/\b[aA]ug\. / aou3t /g
  s/\b[dD]\(e\|e1\)c\. / de1cembre /g
  s/\b[fF]e[bv]\. / fe1vrier /g
  s/\b[jJ]an\. / janvier /g
  s/\b[jJ]ul\. / juillet /g
  s/\b[jJ]un\. / juin /g
  s/\b[mM]ar\. / mars /g
  s/\b[nN]ov\. / novembre /g
  s/\b[oO]ct\. / octobre /g
  s/\b[sS]ept\. / septembre /g
  #=></Months>
  s/\b[Ss]te1\. / socie1te1 /g
  s/\b[Nn]um\. / nume1ro /g
  s/\b[Tt]\(e1\|e\)l\. / te1le1phone /g
  s/\bart\. / article /g
  s/\bann\. / annexe /g
  s/\bappt\. / appartement /g
  # NOTE(review): "apr. " is already consumed by the "avril" month rule
  # above, so this rule cannot match as ordered - confirm which reading
  # of "apr." is wanted.
  s/\bapr\. / apre2s /g
  s/\bav\. / avant /g
  s/\bba3t\. / ba3timent /g
  s/\bbd\. / boulevard /g
  s/\bboul\. / boulevard /g
  s/\bBd\. / boulevard /g
  s/\bchap\. / chapitre /g
  s/\b[Cc]ie\. / compagnie /g
  s/\bdir\. / directeur /g
  s/\be2d\. / e2dition /g
  s/\be2ds\. / e2ditions /g
  s/\be\. \?g\. / par exemple /g
  s/\bet al\. / et autres /g
  s/\betc\. / et caetera /g
  s/\bfe1m\. / fe1minin /g
  s/\bfig\. / figure /g
  s/\bi\. \?e\. / c' est a2 dire /g
  # Trailing space added to the replacement (the pattern consumes one).
  s/\bmasc\. / masculin /g
  # A space now follows every period, so allow one between "N." and "D."
  s/\bN\. \?D\. / notre Dame /g
  # p.12 => page 12. Digits are written [0-9]\+ (\d and a bare + are not
  # a digit class / repetition in a basic regex), and the space inserted
  # after the period is allowed for.
  s/\bp\. \?\([0-9]\+\)/ page \1/g
  s/\bvs\. / versus /g
  s/\bc\.\? \?a\.\? \?d\.\? / c' est a2 dire /g
  # 10% => 10 pour cent
  s/\([0-9]\)%/\1 pour cent /g
  # +1 => plus 1 (a leading "+" is a literal character in a basic regex)
  s/+\([0-9]\)/ plus \1/g
  # x2 => fois 2 (any "x" directly followed by a digit)
  s/x\([0-9]\)/ fois \1/g