Blame view
LDA/utils.py
1.49 KB
b6d0165d1 Initial commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 |
# -*- coding: utf-8 -*-
"""Tokenization and normalization helpers for the LDA pipeline.

Normalizes French transit transcripts in which acronyms arrive
letter-spaced ("r e r" -> "rer", "s n c f" -> "sncf") and elisions are
split ("jusqu '" -> "jusqu'"), then tokenizes them with an NLTK
regexp tokenizer.
"""
import re

import nltk

# Token pattern: numbers (optional decimal part, optional %), 1-2 letter
# elisions ending in an apostrophe ("l'", "qu'"), the literal "<unk>"
# placeholder, words (including common French accented letters), or any
# single non-word non-space character (punctuation).
pattern = r"\d+(?:\.\d+)?\s*%?|\w{1,2}'|<unk>|[\wéàèùêôûâòìîç]+|[^\w\s]"

# Re-join letter-spaced acronyms produced by the upstream transcription.
rer_b = re.compile(r" r e r(?: e r)? b ")
rer_c = re.compile(r" r e r(?: e r)? c |r e r( e r)? c' est | rer c' est")
# NOTE(review): the original pattern listed the alternative " r e rer "
# twice; the duplicate was dead and has been removed (no behavior change).
rer = re.compile(r" (e )?r e r(?: e r)? |re r( e r)? |rer e r | r e rer | r e r |r( e r)+ ")
sncf = re.compile(r" s n c f ")
jusq = re.compile(r" jusqu ' ")
ratp = re.compile(r" r a t(?: p)? ")
quel = re.compile(r" quelqu ' ")
space = re.compile(r" +")

# Unicode-aware tokenizer built from `pattern` above.
tok2 = nltk.RegexpTokenizer(pattern, flags=re.UNICODE)


def preproc(line):
    """Normalize one raw transcript line and return it lower-cased.

    Collapses runs of spaces, re-joins spaced-out acronyms
    ("r e r b" -> "rer b", "s n c f" -> "sncf", "r a t p" -> "ratp")
    and spaced elisions ("jusqu '" -> "jusqu'").
    """
    line = space.sub(" ", line)
    line = rer_b.sub(" rer b ", line)
    line = rer_c.sub(" rer c ", line)
    line = rer.sub(" rer ", line)
    line = sncf.sub(" sncf ", line)
    line = ratp.sub(" ratp ", line)
    line = jusq.sub(" jusqu' ", line)
    line = quel.sub(" quelqu' ", line)
    line = space.sub(" ", line)
    return line.lower()


def yield_corpus(df_list):
    """Yield token lists for the text column (index 2) of each DataFrame.

    Parameters
    ----------
    df_list : iterable of pandas.DataFrame
        Each frame's third column holds one document per row
        (bytes or str — presumably UTF-8 encoded; TODO confirm upstream).

    Yields
    ------
    list of str
        Tokens of the preprocessed document.
    """
    for corpus in df_list:
        for _id, doc in corpus.iterrows():
            text = doc[2]
            try:
                # Accept both legacy bytes cells and already-decoded str.
                if isinstance(text, bytes):
                    text = text.decode("utf-8")
                yield tok2.tokenize(preproc(text))
            except Exception:
                # Log the offending document before propagating the error.
                print(doc[2])
                raise


def select(elm):
    """Return the integer suffix after the last underscore of *elm*."""
    return int(elm.split("_")[-1])
7db73861f add vae et mmf |
42 43 44 45 |
def select_mmf(elm):
    """Return the integer prefix of *elm* (the text before the first "_")."""
    prefix, _, _ = elm.partition("_")
    return int(prefix)