# -*- coding: utf-8 -*-
import nltk
import re
pattern =  ur"\d+(?:\.\d+)?\s*%?|\w{1,2}'|<unk>|[\wéàèùêôûâòìîç]+|[^\w\s]"
rer_b = re.compile(ur" r e r(?: e r)? b ")
rer_c = re.compile(ur" r e r(?: e r)? c |r e r( e r)? c' est | rer c' est")
rer = re.compile(ur" (e )?r e r(?: e r)? |re r( e r)? |rer e r | r e rer | r e r | r e rer |r( e r)+ ")
sncf = re.compile(ur" s n c f ")
jusq = re.compile(ur" jusqu ' ")
ratp = re.compile(ur" r a t(?: p)? ")
quel = re.compile(ur" quelqu ' ")
space = re.compile(ur" +")
tok2 = nltk.RegexpTokenizer(pattern,flags=re.UNICODE )
# (?x)\d+(\.\d+)?\s*%| \w'| \w+| [^\w\s]

def preproc(line):
    """Normalize one transcript line.

    Collapses space runs, rewrites spelled-out acronyms (rer/rer b/rer c,
    sncf, ratp) and spaced elisions ("jusqu '", "quelqu '") into their
    compact forms, then returns the lowercased result.

    NOTE(review): the substitution regexes match lowercase text only, so
    input is presumably already lowercase — confirm with the upstream
    producer, since lowercasing happens last.
    """
    rules = (
        (space, u" "),
        (rer_b, u" rer b "),
        (rer_c, u" rer c "),
        (rer, u" rer "),
        (sncf, u" sncf "),
        (ratp, u" ratp "),
        (jusq, u" jusqu' "),
        (quel, u" quelqu' "),
        (space, u" "),
    )
    for regex, replacement in rules:
        line = regex.sub(replacement, line)
    return line.lower()

def yield_corpus(df_list):
for corpus in df_list:
for id,doc in corpus.iterrows():
try:
a = tok2.tokenize(preproc(doc[2].decode("utf-8")))
#          print 3, " ".join(a).encode("utf8")
yield a
except:
print doc[2]
raise
def select(elm):
    """Return the field after the last underscore in *elm* as an int.

    If *elm* contains no underscore, the whole string is converted.
    """
    suffix = elm.rpartition("_")[2]
    return int(suffix)

def select_mmf(elm):
    """Return the field before the first underscore in *elm* as an int.

    If *elm* contains no underscore, the whole string is converted.
    """
    prefix, _, _ = elm.partition("_")
    return int(prefix)