# utils.py 3.43 KB
# -*- coding: utf-8 -*-
import nltk
import re
import codecs
import numpy as np
import sqlite3

# Tokenizer pattern. Alternatives are tried left to right:
#   \d+(?:\.\d+)?\s*%?   an integer or decimal number, optionally followed by "%"
#   \w{1,2}'             one or two word characters followed by an apostrophe
#                        (presumably elided forms such as "l'" or "qu'" -- TODO confirm)
#   <unk>                the literal token "<unk>"
#   [\wéàèùêôûâòìîç]+    a run of word characters and the listed accented letters
#   [^\w\s]              any single character that is neither a word character nor whitespace
pattern =  ur"\d+(?:\.\d+)?\s*%?|\w{1,2}'|<unk>|[\wéàèùêôûâòìîç]+|[^\w\s]"
# The regexes below match names written as space-separated letters
# (presumably spelled-out acronyms in transcribed text -- TODO confirm).
# preproc() replaces each match with the joined form.
# " r e r b " or " r e r e r b "
rer_b = re.compile(ur" r e r(?: e r)? b ")
# " r e r c " / " r e r e r c ", plus the "r e r c' est" and " rer c' est" variants
rer_c = re.compile(ur" r e r(?: e r)? c |r e r( e r)? c' est | rer c' est")
# Various spaced or partially joined spellings of "rer".
# NOTE(review): " r e rer " is listed twice and " r e r " is already covered by
# the first alternative; those duplicates are redundant.
rer = re.compile(ur" (e )?r e r(?: e r)? |re r( e r)? |rer e r | r e rer | r e r | r e rer |r( e r)+ ")
# " s n c f "
sncf = re.compile(ur" s n c f ")
# " jusqu ' " (apostrophe separated from the word by a space)
jusq = re.compile(ur" jusqu ' ")
# " r a t " or " r a t p "
ratp = re.compile(ur" r a t(?: p)? ")
# " quelqu ' " (apostrophe separated from the word by a space)
quel = re.compile(ur" quelqu ' ")
# One or more consecutive space characters
space = re.compile(ur" +")
# Regexp tokenizer built from `pattern`, compiled with re.UNICODE
tok2 = nltk.RegexpTokenizer(pattern,flags=re.UNICODE )
# Commented-out variant of the pattern (presumably an earlier version, kept for reference):
# (?x)\d+(\.\d+)?\s*%| \w'| \w+| [^\w\s]

def preproc(line):
    """Normalise one line of text before tokenisation.

    Runs of spaces are collapsed, each module-level pattern is replaced by
    its joined form (in the fixed order below), spaces are collapsed again,
    and the result is lower-cased.

    :param line: unicode text to normalise.
    :return: the normalised, lower-cased text.
    """
    # Order matters: the more specific "rer b" / "rer c" rewrites must run
    # before the generic "rer" one.
    rewrites = (
        (space, u" "),
        (rer_b, u" rer b "),
        (rer_c, u" rer c "),
        (rer, u" rer "),
        (sncf, u" sncf "),
        (ratp, u" ratp "),
        (jusq, u" jusqu' "),
        (quel, u" quelqu' "),
        (space, u" "),
    )
    for regex, replacement in rewrites:
        line = regex.sub(replacement, line)
    return line.lower()


def yield_corpus(df_list):
    """Yield the token list of every document in a sequence of corpora.

    Each element of ``df_list`` must provide ``iterrows()`` (e.g. a pandas
    DataFrame). For every row, the value at ``doc[2]`` is decoded from
    UTF-8, passed through ``preproc`` and tokenised with ``tok2``.
    (``doc[2]`` is assumed to be a UTF-8 byte string -- confirm with callers.)

    :param df_list: iterable of objects exposing ``iterrows()``.
    :return: generator yielding one list of tokens per row.
    :raises Exception: any error raised while decoding or tokenising a
        document is re-raised after the raw text has been printed.
    """
    for corpus in df_list:
        for _, doc in corpus.iterrows():
            try:
                tokens = tok2.tokenize(preproc(doc[2].decode("utf-8")))
            except Exception:
                # Show the offending raw text, then let the error propagate.
                print(doc[2])
                raise
            # The yield is deliberately outside the try block: closing the
            # generator early raises GeneratorExit at the yield, which must
            # not be reported as a processing failure.
            yield tokens
def select(elm):
    """Return the integer found after the last underscore of ``elm``.

    If ``elm`` contains no underscore, the whole string is converted.
    """
    _, _, suffix = elm.rpartition("_")
    return int(suffix)


def select_mmf(elm):
    """Return the integer found before the first underscore of ``elm``.

    If ``elm`` contains no underscore, the whole string is converted.
    """
    prefix, _, _ = elm.partition("_")
    return int(prefix)

def get_score(table):
    """Print and return summary statistics for train/dev/test score series.

    ``table[0]``, ``table[1]`` and ``table[2]`` are read as the train, dev
    and test score sequences respectively (indexable, same ordering).

    :param table: indexable holding the three score sequences.
    :return: tuple ``(mx_train, mx_dev, mx_test, best_test, argmx_dev)``
        where ``argmx_dev`` is the index of the highest dev score and
        ``best_test`` is the test score at that same index.
    """
    train_scores, dev_scores, test_scores = table[0], table[1], table[2]

    mx_train = np.max(train_scores)
    # Model selection is done on dev: take the test score at the dev optimum.
    argmx_dev = np.argmax(dev_scores)
    mx_dev = dev_scores[argmx_dev]
    best_test = test_scores[argmx_dev]
    mx_test = np.max(test_scores)

    # NOTE: lines after the first start with four spaces before the tab;
    # this reproduces the historical output exactly.
    report = (
        "\tmax train : {}\n"
        "    \tmax dev : {}\n"
        "    \tmax test : {} - best test : {}\n"
        "    \t best epochs : {}"
    )
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(report.format(mx_train, mx_dev, mx_test, best_test, argmx_dev))
    return mx_train, mx_dev, mx_test, best_test, argmx_dev
class WeightedWordsList:
    """Word weights loaded from a UTF-8 text file of ``word:weight`` lines."""

    @staticmethod
    def get_key(wtuple):
        """Return the second item of ``wtuple`` (the weight of a ``(word, weight)`` pair)."""
        return wtuple[1]

    @staticmethod
    def get_okey(wtuple):
        """Return the weight of an ``(index, (word, weight))`` item from ``enumerate``."""
        return wtuple[1][1]

    def __init__(self, file_path):
        """Load the weights stored in ``file_path``.

        Each non-blank line must look like ``word:weight``. The line is split
        on its LAST colon so that a word may itself contain ``:``.

        :param file_path: path of the UTF-8 encoded weight file.
        :raises ValueError: if a non-blank line has no colon or its weight
            is not a valid float.
        """
        # The context manager guarantees the file handle is closed.
        with codecs.open(file_path, "r", "utf8") as handle:
            raw_lines = handle.readlines()

        # wlist: (unicode word, float weight) pairs in file order.
        self.wlist = []
        for raw in raw_lines:
            entry = raw.strip()
            if not entry:
                # Blank lines (e.g. a trailing newline) carry no data.
                continue
            word, sep, weight = entry.rpartition(':')
            if not sep:
                raise ValueError("expected 'word:weight', got %r" % entry)
            self.wlist.append((word, float(weight)))

        # wdict: UTF-8 encoded word -> weight; a later duplicate overrides
        # an earlier one.
        self.wdict = {}
        for word, weight in self.wlist:
            self.wdict[word.encode("utf8")] = weight

    def select_best(self, word_list, lenght=5):
        """Return the highest-weighted known words of ``word_list``.

        Words missing from the weight table are ignored. Up to ``lenght``
        words are kept; a word whose weight ties with the current minimum is
        kept as well, so the result can be longer than ``lenght``. Kept words
        stay in insertion order, and repeated input words may appear several
        times.

        :param word_list: iterable of words; each is encoded to UTF-8 before
            lookup.
        :param lenght: target number of words to keep (name kept for
            backward compatibility).
        :return: list of UTF-8 encoded words.
        """
        if lenght <= 0:
            # Nothing can be kept; avoids calling min() on an empty list.
            return []

        lowest = WeightedWordsList.get_okey
        scored_word = []
        for w in word_list:
            w = w.encode("utf8")
            if w not in self.wdict:
                continue
            w_curr = (w, self.wdict[w])

            if len(scored_word) < lenght:
                scored_word.append(w_curr)
                continue

            # w_min is (index, (word, weight)) of the first lowest entry.
            w_min = min(enumerate(scored_word), key=lowest)
            if w_min[1][1] < w_curr[1]:
                # Replace the current minimum with the better word ...
                del scored_word[w_min[0]]
                scored_word.append(w_curr)
                # ... then trim entries that were only kept because of ties.
                w_min = min(enumerate(scored_word), key=lowest)
                while len(scored_word) > lenght and w_min[1][1] < w_curr[1]:
                    del scored_word[w_min[0]]
                    w_min = min(enumerate(scored_word), key=lowest)
            elif w_min[1][1] == w_curr[1]:
                # A tie with the minimum is kept in addition.
                scored_word.append(w_curr)
        return [entry[0] for entry in scored_word]