# utils.py
# -*- coding: utf-8 -*-
import nltk
import re
# Tokenizer pattern (Python 2 unicode-raw literal): matches, in order of
# priority, numbers (optionally decimal, optionally followed by "%"),
# 1-2 letter elisions ("l'", "qu'"), the literal "<unk>" token, words
# (including accented French letters), or any single non-word,
# non-space character (punctuation).
pattern =  ur"\d+(?:\.\d+)?\s*%?|\w{1,2}'|<unk>|[\wéàèùêôûâòìîç]+|[^\w\s]"
# Normalization patterns for letter-by-letter transit names as produced
# by ASR output ("r e r b" -> "rer b", "s n c f" -> "sncf", ...).
# NOTE(review): all of these expect surrounding spaces and lowercase
# input — confirm callers pad/lowercase lines before matching.
rer_b = re.compile(ur" r e r(?: e r)? b ")
rer_c = re.compile(ur" r e r(?: e r)? c |r e r( e r)? c' est | rer c' est")
rer = re.compile(ur" (e )?r e r(?: e r)? |re r( e r)? |rer e r | r e rer | r e r | r e rer |r( e r)+ ")
sncf = re.compile(ur" s n c f ")
# Re-attach elisions that were split by a previous tokenization step.
jusq = re.compile(ur" jusqu ' ")
ratp = re.compile(ur" r a t(?: p)? ")
quel = re.compile(ur" quelqu ' ")
# Collapses runs of spaces to a single space.
space = re.compile(ur" +")
tok2 = nltk.RegexpTokenizer(pattern,flags=re.UNICODE )
# Earlier draft of the tokenizer pattern, kept for reference:
# (?x)\d+(\.\d+)?\s*%| \w'| \w+| [^\w\s]

def preproc(line):
    """Normalize a raw transcript line (unicode in, unicode out).

    Collapses whitespace, rewrites spelled-out transit names
    ("r e r b" -> "rer b", "s n c f" -> "sncf", "r a t p" -> "ratp")
    and re-attaches split elisions ("jusqu '" -> "jusqu'",
    "quelqu '" -> "quelqu'").  Returns the lowercased, normalized line.
    """
    # Lowercase FIRST: every compiled pattern below is lowercase-only,
    # so uppercase variants ("R E R B") would otherwise skip the
    # substitutions and only get lowercased on return.
    line = line.lower()
    line = space.sub(u" ", line)
    # Order matters: the line-specific patterns (rer b / rer c) must
    # run before the generic `rer` catch-all consumes the letters.
    line = rer_b.sub(u" rer b ", line)
    line = rer_c.sub(u" rer c ", line)
    line = rer.sub(u" rer ", line)
    line = sncf.sub(u" sncf ", line)
    line = ratp.sub(u" ratp ", line)
    line = jusq.sub(u" jusqu' ", line)
    line = quel.sub(u" quelqu' ", line)
    # Second pass: the substitutions above can introduce doubled spaces.
    line = space.sub(u" ", line)
    return line

def yield_corpus(df_list):
    """Yield one token list per row from a list of DataFrame-like corpora.

    Each corpus is iterated with ``iterrows()``; column 2 is assumed to
    hold the raw utf-8 encoded text — TODO confirm against callers.
    Rows whose text fails to decode or tokenize are printed for
    debugging, then the error is re-raised.
    """
    for corpus in df_list:
        for _, doc in corpus.iterrows():  # row label is unused
            try:
                yield tok2.tokenize(preproc(doc[2].decode("utf-8")))
            except Exception:
                # Surface the offending raw text before propagating;
                # narrowed from a bare except so Ctrl-C still exits.
                print(doc[2])
                raise
def select(elm):
    """Return the last underscore-separated field of *elm* as an int."""
    tail = elm.rsplit("_", 1)[-1]
    return int(tail)