# utils.py
# -*- coding: utf-8 -*-
import nltk
import re
# Token pattern for the NLTK tokenizer: a number (optionally decimal, with an
# optional trailing %), a 1-2 letter elision ending in an apostrophe (l', qu'),
# the literal <unk> marker, a word (including French accented letters),
# or any single character that is neither word nor whitespace (punctuation).
pattern = ur"\d+(?:\.\d+)?\s*%?|\w{1,2}'|<unk>|[\wéàèùêôûâòìîç]+|[^\w\s]"
# The following patterns canonicalize ASR-style spelled-out Paris transit
# acronyms, e.g. "r e r b" -> "rer b".  Leading/trailing spaces in the
# patterns mean they only match whole space-delimited tokens.
rer_b = re.compile(ur" r e r(?: e r)? b ")
rer_c = re.compile(ur" r e r(?: e r)? c |r e r( e r)? c' est | rer c' est")
# NOTE(review): the alternative " r e rer " appears twice in this pattern;
# the duplicate is harmless (alternation is first-match) but redundant.
rer = re.compile(ur" (e )?r e r(?: e r)? |re r( e r)? |rer e r | r e rer | r e r | r e rer |r( e r)+ ")
sncf = re.compile(ur" s n c f ")
# Re-attach elided French forms split by upstream tokenization:
# "jusqu '" -> "jusqu'", "quelqu '" -> "quelqu'".
jusq = re.compile(ur" jusqu ' ")
ratp = re.compile(ur" r a t(?: p)? ")
quel = re.compile(ur" quelqu ' ")
# Collapses runs of spaces into one.
space = re.compile(ur" +")
# Unicode-aware tokenizer built from `pattern` above.
tok2 = nltk.RegexpTokenizer(pattern,flags=re.UNICODE )
# Author's earlier sketch of the token pattern, kept for reference:
# (?x)\d+(\.\d+)?\s*%| \w'| \w+| [^\w\s]
def preproc(line):
    """Normalize one unicode transcript line.

    Collapses whitespace, canonicalizes spelled-out transit acronyms
    ("r e r b" -> "rer b", "s n c f" -> "sncf", ...), re-attaches elided
    French forms ("jusqu '" -> "jusqu'"), and returns the lowercased result.
    """
    # Order matters: spaces are collapsed first so the spaced-letter
    # patterns can match, and once more at the end to tidy the output
    # of the substitutions.
    rewrites = (
        (space, u" "),
        (rer_b, u" rer b "),
        (rer_c, u" rer c "),
        (rer, u" rer "),
        (sncf, u" sncf "),
        (ratp, u" ratp "),
        (jusq, u" jusqu' "),
        (quel, u" quelqu' "),
        (space, u" "),
    )
    for regex, replacement in rewrites:
        line = regex.sub(replacement, line)
    return line.lower()
def yield_corpus(df_list):
    """Yield one token list per document drawn from a list of corpora.

    NOTE(review): iterrows() suggests each element of df_list is a pandas
    DataFrame — confirm against callers.  Each row's third column (doc[2])
    appears to hold raw utf-8 encoded text; it is decoded, normalized with
    preproc(), and tokenized with the module-level tok2 tokenizer.
    On any failure the raw offending text is printed, then the original
    exception is re-raised.
    """
    for corpus in df_list:
        for id,doc in corpus.iterrows():
            try:
                a = tok2.tokenize(preproc(doc[2].decode("utf-8")))
                # print 3, " ".join(a).encode("utf8")
                yield a
            except:
                # Show which document broke decoding/tokenization before
                # propagating the exception unchanged.
                print doc[2]
                raise
def select(elm):
    """Return the integer that follows the last underscore in *elm*.

    If *elm* has no underscore, the whole string is parsed as an int.
    Raises ValueError when the selected piece is not a valid integer.
    """
    tail = elm.rpartition("_")[2]
    return int(tail)
def select_mmf(elm):
    """Return the integer prefix of *elm*, read up to the first underscore.

    If *elm* has no underscore, the whole string is parsed as an int.
    Raises ValueError when the prefix is not a valid integer.
    """
    head = elm.partition("_")[0]
    return int(head)