utils.py
3.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
# -*- coding: utf-8 -*-
import nltk
import re
import codecs
import numpy as np
import sqlite3
# Tokenizer pattern; alternatives are tried left to right:
#   1. a number with an optional decimal part, followed by optional
#      whitespace and an optional percent sign;
#   2. one or two word characters followed by an apostrophe;
#   3. the literal token "<unk>";
#   4. a run of word characters and/or the listed accented letters;
#   5. any single character that is neither a word character nor whitespace.
pattern = ur"\d+(?:\.\d+)?\s*%?|\w{1,2}'|<unk>|[\wéàèùêôûâòìîç]+|[^\w\s]"
# Space-delimited " r e r b " or " r e r e r b " (letters separated by spaces).
rer_b = re.compile(ur" r e r(?: e r)? b ")
# Same spelled-out form followed by "c", plus variants followed by "c' est".
rer_c = re.compile(ur" r e r(?: e r)? c |r e r( e r)? c' est | rer c' est")
# Assorted spaced-out or partially joined spellings of "rer".
rer = re.compile(ur" (e )?r e r(?: e r)? |re r( e r)? |rer e r | r e rer | r e r | r e rer |r( e r)+ ")
# Spelled-out " s n c f ".
sncf = re.compile(ur" s n c f ")
# "jusqu" separated from its apostrophe by a space.
jusq = re.compile(ur" jusqu ' ")
# Spelled-out " r a t " with an optional trailing " p".
ratp = re.compile(ur" r a t(?: p)? ")
# "quelqu" separated from its apostrophe by a space.
quel = re.compile(ur" quelqu ' ")
# One or more consecutive space characters.
space = re.compile(ur" +")
# NLTK regexp tokenizer built from `pattern`, compiled with re.UNICODE.
tok2 = nltk.RegexpTokenizer(pattern,flags=re.UNICODE )
# NOTE(review): presumably an earlier tokenizer pattern kept for reference -- confirm or delete:
# (?x)\d+(\.\d+)?\s*%| \w'| \w+| [^\w\s]
def preproc(line):
    """Normalise a line of text and return it lowercased.

    The module-level regexes are applied in a fixed order: runs of spaces
    are collapsed, spelled-out "rer b" / "rer c" / "rer" / "sncf" / "ratp"
    are joined, the detached apostrophe of "jusqu '" and "quelqu '" is
    re-attached, and spaces are collapsed once more.

    :param line: unicode text to normalise.
    :return: the normalised text, lowercased.
    """
    # Order matters: the more specific "rer b" / "rer c" rules must run
    # before the generic "rer" rule.
    substitutions = (
        (space, u" "),
        (rer_b, u" rer b "),
        (rer_c, u" rer c "),
        (rer, u" rer "),
        (sncf, u" sncf "),
        (ratp, u" ratp "),
        (jusq, u" jusqu' "),
        (quel, u" quelqu' "),
        (space, u" "),
    )
    for regex, replacement in substitutions:
        line = regex.sub(replacement, line)
    return line.lower()
def yield_corpus(df_list):
    """Yield the token list of every row of every corpus in *df_list*.

    For each row, the value at ``doc[2]`` is decoded as UTF-8, normalised
    with ``preproc`` and tokenized with ``tok2``.

    :param df_list: iterable of objects exposing ``iterrows()``
        (presumably pandas DataFrames -- confirm against callers).
    :return: generator of token lists, one per row.
    :raises Exception: any error raised while decoding, normalising or
        tokenizing a row is re-raised after the raw text has been printed.
    """
    for corpus in df_list:
        for _, doc in corpus.iterrows():
            try:
                tokens = tok2.tokenize(preproc(doc[2].decode("utf-8")))
            except Exception:
                # Show the offending document before propagating the error.
                print(doc[2])
                raise
            # Yield outside the try block so that closing the generator
            # (GeneratorExit) or interrupting the consumer does not trigger
            # the diagnostic print above.
            yield tokens
def select(elm):
    """Return the integer found after the last underscore of *elm*.

    When *elm* contains no underscore the whole string is converted.
    """
    last_field = elm.rsplit("_", 1)[-1]
    return int(last_field)
def select_mmf(elm):
    """Return the integer found before the first underscore of *elm*.

    When *elm* contains no underscore the whole string is converted.
    """
    first_field = elm.split("_", 1)[0]
    return int(first_field)
def get_score(table):
    """Print and return summary scores from a train/dev/test score table.

    :param table: indexable where ``table[0]``, ``table[1]`` and
        ``table[2]`` are the train, dev and test score sequences.
    :return: tuple ``(max train, max dev, max test, test score at the
        best dev index, best dev index)``.
    """
    train_scores, dev_scores, test_scores = table[0], table[1], table[2]

    mx_train = np.max(train_scores)
    # The "best" position is the one where the dev score peaks; the test
    # score is then read at that same position.
    argmx_dev = np.argmax(dev_scores)
    mx_dev = dev_scores[argmx_dev]
    best_test = test_scores[argmx_dev]
    mx_test = np.max(test_scores)

    report = (
        "\tmax train : {}\n"
        "\tmax dev : {}\n"
        "\tmax test : {} - best test : {}\n"
        "\t best epochs : {}"
    )
    print(report.format(mx_train, mx_dev, mx_test, best_test, argmx_dev))
    return mx_train, mx_dev, mx_test, best_test, argmx_dev
class WeightedWordsList:
    """Word weights loaded from a UTF-8 text file of ``word:weight`` lines.

    Attributes set by ``__init__``:
      * ``wlist`` -- list of ``(word, weight)`` tuples in file order.
      * ``wdict`` -- mapping from the UTF-8 encoded word to its weight.
    """

    @staticmethod
    def get_key(wtuple):
        """Return the second element (the weight) of a ``(word, weight)`` pair."""
        return wtuple[1]

    @staticmethod
    def get_okey(wtuple):
        """Return the weight of an ``(index, (word, weight))`` item.

        This is the key function used on ``enumerate(...)`` output.
        """
        return wtuple[1][1]

    @staticmethod
    def _weakest(scored_word):
        """Return ``(index, (word, weight))`` of the first lowest-weight entry."""
        return min(enumerate(scored_word), key=WeightedWordsList.get_okey)

    def __init__(self, file_path):
        """Load the weights from *file_path*.

        :param file_path: path of a UTF-8 file with one ``word:weight``
            entry per line.
        :raises ValueError: if a non-blank line has no ``:`` separator or
            its weight is not a valid float.
        """
        # The context manager guarantees the handle is closed even when
        # parsing fails.
        with codecs.open(file_path, "r", "utf8") as handle:
            lines = handle.readlines()

        self.wlist = []
        for raw in lines:
            entry = raw.strip()
            if not entry:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            # Split on the last colon so a word may itself contain ':'.
            word, weight = entry.rsplit(":", 1)
            self.wlist.append((word, float(weight)))

        self.wdict = {}
        for word, weight in self.wlist:
            self.wdict[word.encode("utf8")] = weight

    def select_best(self, word_list, lenght=5):
        """Return the highest-weighted known words of *word_list*.

        Words absent from ``wdict`` are ignored. At most *lenght* words are
        kept, except that words tied with the current lowest kept weight
        are kept as well, so the result may be longer than *lenght*.

        :param word_list: iterable of unicode words.
        :param lenght: target number of words to keep (name kept as-is
            for backward compatibility).
        :return: list of UTF-8 encoded words, in the order they were kept.
        """
        if lenght <= 0:
            # Nothing can be kept; avoids calling min() on an empty list.
            return []

        scored_word = []
        for w in word_list:
            w = w.encode("utf8")
            if w not in self.wdict:
                continue
            w_curr = (w, self.wdict[w])

            if len(scored_word) < lenght:
                scored_word.append(w_curr)
                continue

            w_min = self._weakest(scored_word)
            if w_min[1][1] < w_curr[1]:
                # Replace the weakest entry with the current word ...
                del scored_word[w_min[0]]
                scored_word.append(w_curr)
                # ... then, if earlier ties made the list longer than
                # requested, drop further entries weaker than the newcomer.
                w_min = self._weakest(scored_word)
                while len(scored_word) > lenght and w_min[1][1] < w_curr[1]:
                    del scored_word[w_min[0]]
                    w_min = self._weakest(scored_word)
            elif w_min[1][1] == w_curr[1]:
                # Ties with the current minimum are kept in addition.
                scored_word.append(w_curr)

        return [entry[0] for entry in scored_word]