# ======== PREPROCESS REGEX ===========
#
# Author:
# ------
#  Emmanuel FERREIRA
#
# Warning:
# --------
#  These regexes were first written to process
#  Bdlex formatted data.
#  Accented letters appear in patterns and replacements as
#  letter+digit codes (e.g. e1, e2, a2, u3).
#
# How to use it?
# --------------
#  You can comment, un-comment or correct any rule below.
#  Rules are applied top to bottom, so their order matters.
#  The patterns use GNU sed extensions to basic regular
#  expressions (\b, \+, \?, \|).
#
# =======================================

# to process an already cleaned corpus: sentence tags <s> ... </s>
# (the opening tag must be spelled out: an empty pattern would make sed
#  reuse the previous regex instead)
s/<\/s> /\./g
s/<s>//g
s/<\/s>/\./g

# remove all characters between [ and ]
# (a backslash is literal inside a bracket expression, so "]" and "[" are
#  excluded by listing "]" first)
s/\[[^][]*\]/ /g

# remove all characters between ( and )
s/([^)]*)/ /g

# remove all characters between < and >
s/<[^<>]*>/ /g

# remove url (TODO convert url to txt)
# "-" is kept last in the bracket expression so that it is a literal
# character and not a range operator
s/\bhttps\?:[A-Za-z0-9\/_.=&?-]\+//g

# preformat roman value XVe => XV e / Ier => I er
#s/\bIer\b/ I er /g
#s/\bIere\b/ I e2re /g
#s/\bIe2re\b/ I e2re /g
#s/\b\([IVX]\+\)e\b/ \1 e /g

# remove punctuation _ \ * | - " #
s/_\|\*\|\\\||\|-\|"\|#/ /g

# replace j'etais => j' etais
s/\([A-Za-z0-9]\)'\([A-Za-z0-9]\)/\1' \2/g

# replace 08:32 by 08h32
s/\(\b[0-9]\{2\}\):\([0-9]\{2\}\b\)/\1h\2/g

# replace : by .
s/:/\./g

# replace ! by .
s/!/\./g

# replace ? by .
s/?/\./g

# replace line break sequence by " "
# NOTE: sed strips the trailing newline of each input line, so this only
# fires when the pattern space holds several lines (e.g. GNU "sed -z").
s/\n\+/ /g

# replace . sequence by .
s/\.\+/\./g

# 2e2me => 2 e2me & 1e2re => 1 e2re
s/\b1e2re\b/ 1 e2re /g
s/\b\([0-9]\+\)e2me\b/ \1 e2me /g

# hours
s/\b00\(H\|h\)00\b/ minuit /g
s/\b00\(H\|h\)/ minuit /g
s/\b12\(H\|h\)00\b/ midi /g
s/\b12\(H\|h\)/ midi /g
s/\b0\([1-9]\)\(H\|h\)/ \1H/g

# 1/2 => 1 demi
s/\([0-9]\)\/2\b/\1 demi /g

# 1/4 => 1 quart
s/\([0-9]\)\/4\b/\1 quart /g

# 1/16 => 1 16 e
s/\b\([0-9]\)\/\([0-9][0-9]*\)\b/ \1 \2 e /g

# abbreviation
# Every "." first gets a space appended; all rules below therefore match
# ". " and multi-part abbreviations accept any number of spaces between
# their parts ("J.C." becomes "J. C. ", "J. C." becomes "J.  C.  ").
s/\./\. /g
s/\b[Mm]r\. / mister /g
s/\b[Mm]me\. / madame /g
s/\b[Mm]lle\. / mademoiselle /g
s/\b[Ll]t\. / lieutenant /g
s/\bM\. / monsieur /g
s/\b[Mm]rs\. / Messieurs /g
s/\bMgr\. / Monseigneur /g
s/\bDr\. / docteur /g
s/\bPr\. / professeur /g
s/\bPrs\. / professeurs /g
# the "apr./av. J.C." rules must run before the plain "J.C." rule,
# otherwise "J.C." is already rewritten and they can never match
s/\bapr\?\. \+J\. *C\. / apre2s Je1sus Christ /g
s/\bav\. \+J\. *C\. / avant Je1sus Christ /g
s/\bJ\. *C\. / Je1sus Christ /g
s/\bSt\.\? / saint /g
s/\bSte\.\? / sainte /g

# month abbreviation => month name (like avr. => avril)
s/\b[aA][vp]r\. / avril /g
s/\b[aA]ug\. / aou3t /g
s/\b[dD]\(e\|e1\)c\. / de1cembre /g
s/\b[fF]e[bv]\. / fe1vrier /g
s/\b[jJ]an\. / janvier /g
s/\b[jJ]ul\. / juillet /g
s/\b[jJ]un\. / juin /g
s/\b[mM]ar\. / mars /g
s/\b[nN]ov\. / novembre /g
s/\b[oO]ct\. / octobre /g
s/\b[sS]ept\. / septembre /g

# other abbreviation => expanded word
s/\b[Ss]te1\. / socie1te1 /g
s/\b[Nn]um\. / nume1ro /g
s/\b[Tt]\(e1\|e\)l\. / te1le1phone /g
s/\bart\. / article /g
s/\bann\. / annexe /g
s/\bappt\. / appartement /g
# NOTE(review): "apr." is already rewritten to "avril" by the month rule
# above, so this rule never fires - confirm which expansion is wanted.
s/\bapr\. / apre2s /g
s/\bav\. / avant /g
s/\bba3t\. / ba3timent /g
s/\bbd\. / boulevard /g
s/\bboul\. / boulevard /g
s/\bBd\. / boulevard /g
s/\bchap\. / chapitre /g
s/\b[Cc]ie\. / compagnie /g
s/\bdir\. / directeur /g
s/\be2d\. / e2dition /g
s/\be2ds\. / e2ditions /g
s/\be\. *g\. / par exemple /g
s/\bet al\. / et autres /g
s/\betc\. / et caetera /g
s/\bfe1m\. / fe1minin /g
s/\bfig\. / figure /g
s/\bi\. *e\. / c' est a2 dire /g
s/\bmasc\. / masculin /g
s/\bN\. *D\. / notre Dame /g
# p.12 => page 12 (a space has been inserted after the dot at this point)
s/\bp\. *\([0-9]\+\)/ page \1/g
s/\bvs\. / versus /g
s/\bc\.\? *a\.\? *d\.\? / c' est a2 dire /g

# 10% => 10 pour cent
s/\([0-9]\)%/\1 pour cent /g

# +1 => plus 1
s/+\([0-9]\)/ plus \1/g

# x2 => fois 2
s/x\([0-9]\)/ fois \1/g