# preprocess.regex (3.11 KB)
# ======== PREPROCESS REGEX ===========
#
# Author:
# ------
# Emmanuel FERREIRA
#
# Warning:
# --------
# These regexes were originally written to process
# BDLex-formatted data.
#
# How to use it?
# --------------
# You can comment out, un-comment, or correct the rules below as needed.
#
# =======================================
# Process an already-cleaned corpus: sentence markers <s> ... </s> become
# full stops (a closing marker directly followed by an opening one yields
# a single full stop).
s/<\/s> <s>/\./g
s/<s>//g
s/<\/s>/\./g
# Replace every [ ... ] span with a space. The class excludes the closing
# bracket so a match stops at the first "]" instead of running to a later one.
s/\[[^]]*\]/ /g
# Replace every ( ... ) span with a space (parentheses are literal in BRE).
s/([^)]*)/ /g
# Replace every < ... > span with a space; the match stops at the first ">".
s/<[^>]*>/ /g
# Remove URLs (TODO: convert URLs to text instead).
# "-" is kept last in the bracket expression so it is a literal hyphen; the
# former "_-\" was a reversed range. Both http: and https: are accepted.
s/\bhttps\?:[A-Za-z0-9\/_.=&?%~#-]\+//g
# preformat roman value XVe => XV e / Ier => I er
#s/\bIer\b/ I er /g
#s/\bIere\b/ I e2re /g
#s/\bIe2re\b/ I e2re /g
#s/\b\([IVX]\+\)e\b/ \1 e /g
# Replace the punctuation characters _ * \ | - " # with a space.
s/_\|\*\|\\\||\|-\|"\|#/ /g
# Detach an apostrophe from the following word: j'etais => j' etais
s/\([A-Za-z0-9]\)'\([A-Za-z0-9]\)/\1' \2/g
# Clock times: 08:32 => 08h32 (must run before ":" is turned into ".").
s/\(\b[0-9]\{2\}\):\([0-9]\{2\}\b\)/\1h\2/g
# Replace : by .
s/:/\./g
# Replace ! by .
s/!/\./g
# Replace ? by . ("?" is a literal character in a basic regular expression)
s/?/\./g
# Replace a run of line breaks by a single space. "\+" is the BRE repetition
# operator; a bare "+" would only match a literal plus sign. This rule can
# only fire when the pattern space holds several lines (e.g. sed -z, or
# after an N command).
s/\n\+/ /g
# Collapse a run of full stops into a single one.
s/\.\+/\./g
# Ordinals: separate the number from its suffix.
#   1e2re => 1 e2re      2e2me => 2 e2me
s/\b1e2re\b/ 1 e2re /g
s/\b\([0-9][0-9]*\)e2me\b/ \1 e2me /g
# Hours: 00h / 00h00 => minuit, 12h / 12h00 => midi; the leading zero of
# 01h..09h is dropped and the separator is written as an upper-case H.
s/\b00[Hh]00\b/ minuit /g
s/\b00[Hh]/ minuit /g
s/\b12[Hh]00\b/ midi /g
s/\b12[Hh]/ midi /g
s/\b0\([1-9]\)[Hh]/ \1H/g
# Fractions: 1/2 => 1 demi
s/\([0-9]\)\/2\b/\1 demi /g
# 1/4 => 1 quart
s/\([0-9]\)\/4\b/\1 quart /g
# Any other single-digit numerator: 1/16 => 1 16 e
s/\b\([0-9]\)\/\([0-9]\+\)\b/ \1 \2 e /g
# Abbreviations.
# First put a space after every full stop so each rule below can anchor on
# ". ". A full stop that was already followed by a space ends up followed by
# two, which is why the multi-part rules accept any number of spaces between
# their parts.
s/\./\. /g
s/\b[Mm]r\. / mister /g
s/\b[Mm]me\. / madame /g
s/\b[Mm]lle\. / mademoiselle /g
s/\b[Ll]t\. / lieutenant /g
s/\bM\. / monsieur /g
s/\b[Mm]rs\. / Messieurs /g
# The replacement ends with a space because the pattern consumes the one
# that followed the abbreviation.
s/\bMgr\. / Monseigneur /g
s/\bDr\. / docteur /g
s/\bPr\. / professeur /g
s/\bPrs\. / professeurs /g
# "apr. J.C." and "av. J.C." must be expanded before the bare "J.C." rule,
# otherwise that rule rewrites "J. C." first and these two never match.
s/\bapr\?\. *J\. *C\. / apre2s Je1sus Christ /g
s/\bav\. *J\. *C\. / avant Je1sus Christ /g
s/\bJ\. *C\. / Je1sus Christ /g
s/\bSt\.\? / saint /g
s/\bSte\.\? / sainte /g
# Months (e.g. avr. => avril)
s/\b[aA][vp]r\. / avril /g
s/\b[aA]ug\. / aou3t /g
s/\b[dD]\(e\|e1\)c\. / de1cembre /g
s/\b[fF]e[bv]\. / fe1vrier /g
s/\b[jJ]an\. / janvier /g
s/\b[jJ]ul\. / juillet /g
s/\b[jJ]un\. / juin /g
s/\b[mM]ar\. / mars /g
s/\b[nN]ov\. / novembre /g
s/\b[oO]ct\. / octobre /g
s/\b[sS]ept\. / septembre /g
# End of months
s/\b[Ss]te1\. / socie1te1 /g
s/\b[Nn]um\. / nume1ro /g
s/\b[Tt]\(e1\|e\)l\. / te1le1phone /g
s/\bart\. / article /g
s/\bann\. / annexe /g
s/\bappt\. / appartement /g
# NOTE(review): this rule never fires, because "apr. " has already been
# rewritten to "avril" by the month rules above. Decide which reading is wanted.
s/\bapr\. / apre2s /g
s/\bav\. / avant /g
s/\bba3t\. / ba3timent /g
s/\bbd\. / boulevard /g
s/\bboul\. / boulevard /g
s/\bBd\. / boulevard /g
s/\bchap\. / chapitre /g
s/\b[Cc]ie\. / compagnie /g
s/\bdir\. / directeur /g
s/\be2d\. / e2dition /g
s/\be2ds\. / e2ditions /g
s/\be\. *g\. / par exemple /g
s/\bet al\. / et autres /g
s/\betc\. / et caetera /g
s/\bfe1m\. / fe1minin /g
s/\bfig\. / figure /g
s/\bi\. *e\. / c' est a2 dire /g
s/\bmasc\. / masculin /g
# The space inserted after every full stop sits between the two initials,
# so it has to be allowed here.
s/\bN\. *D\. / notre Dame /g
# p.12 => page 12. Digits are matched with [0-9]\+ ("\d" is not a digit
# class in sed), after the space inserted behind the full stop.
s/\bp\. *\([0-9]\+\)/ page \1/g
s/\bvs\. / versus /g
s/\bc\.\? *a\.\? *d\.\? / c' est a2 dire /g
# Percent sign after a digit: 10% => 10 pour cent
s/\([0-9]\)[%]/\1 pour cent /g
# Plus sign before a digit: +1 => plus 1
s/[+]\([0-9]\)/ plus \1/g
# Letter x before a digit: x2 => fois 2
s/[x]\([0-9]\)/ fois \1/g