Blame view
LDA/utils.py
3.43 KB
b6d0165d1 Initial commit |
1 2 3 |
# -*- coding: utf-8 -*- import nltk import re |
2af8e57f4 change all |
4 5 6 |
import codecs import numpy as np import sqlite3 |
b6d0165d1 Initial commit |
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
# Tokenization pattern: numbers (optionally decimal, optionally "%"),
# 1-2 letter elisions followed by an apostrophe ("l'", "qu'"...), the
# literal "<unk>" token, accented French words, or any single
# non-word/non-space character.
# NOTE: u"..." with escaped backslashes replaces the py2-only ur"..."
# literals so the module parses under both Python 2 and 3.
pattern = u"\\d+(?:\\.\\d+)?\\s*%?|\\w{1,2}'|<unk>|[\\wéàèùêôûâòìîç]+|[^\\w\\s]"

# Normalisation patterns gluing back French transit vocabulary spelled
# out letter by letter ("r e r" -> "rer", "s n c f" -> "sncf", ...).
rer_b = re.compile(u" r e r(?: e r)? b ")
rer_c = re.compile(u" r e r(?: e r)? c |r e r( e r)? c' est | rer c' est")
# FIX: the original alternation listed " r e rer " twice; the duplicate
# alternative is removed (matching behaviour is unchanged).
rer = re.compile(u" (e )?r e r(?: e r)? |re r( e r)? |rer e r | r e rer | r e r |r( e r)+ ")
sncf = re.compile(u" s n c f ")
jusq = re.compile(u" jusqu ' ")
ratp = re.compile(u" r a t(?: p)? ")
quel = re.compile(u" quelqu ' ")
space = re.compile(u" +")

tok2 = nltk.RegexpTokenizer(pattern, flags=re.UNICODE)


def preproc(line):
    """Normalise one utterance: collapse runs of spaces, rejoin the
    letter-spelled acronyms/elisions above, and lowercase the result."""
    line = space.sub(u" ", line)
    line = rer_b.sub(u" rer b ", line)
    line = rer_c.sub(u" rer c ", line)
    line = rer.sub(u" rer ", line)
    line = sncf.sub(u" sncf ", line)
    line = ratp.sub(u" ratp ", line)
    line = jusq.sub(u" jusqu' ", line)
    line = quel.sub(u" quelqu' ", line)
    # Collapse again: the substitutions above may introduce double spaces.
    line = space.sub(u" ", line)
    return line.lower()


def yield_corpus(df_list):
    """Yield the token list of column 2 of every row of every dataframe
    in df_list; the column is assumed to hold UTF-8 encoded bytes
    (TODO confirm against the callers that build these dataframes)."""
    for corpus in df_list:
        for doc_id, doc in corpus.iterrows():  # renamed: "id" shadowed a builtin
            try:
                yield tok2.tokenize(preproc(doc[2].decode("utf-8")))
            except Exception:
                # Show the offending document, then propagate.
                print(doc[2])
                raise


def select(elm):
    """Return the integer after the last '_' in elm (e.g. "run_12" -> 12)."""
    return int(elm.split("_")[-1])
7db73861f add vae et mmf |
45 46 47 48 |
def select_mmf(elm):
    """Return the integer prefix of elm (everything before the first '_')."""
    prefix, _sep, _rest = elm.partition("_")
    return int(prefix)
2af8e57f4 change all |
49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
def get_score(table):
    """Report and return score maxima from a 3-row per-epoch score table.

    table[0]/table[1]/table[2] hold train/dev/test scores per epoch.
    Returns (max_train, max_dev, max_test, test_at_best_dev, best_epoch)
    where best_epoch is the epoch with the highest dev score.
    """
    mx_train = np.max(table[0])
    argmx_dev = np.argmax(table[1])
    mx_dev = table[1][argmx_dev]
    # Test score at the dev-selected epoch (the honest number to report),
    # alongside the overall test maximum.
    best_test = table[2][argmx_dev]
    mx_test = np.max(table[2])
    # print(...) with a single argument is valid Python 2 and 3.
    print("""\tmax train : {} \tmax dev : {} \tmax test : {} - best test : {} \t best epochs : {}""".format(mx_train, mx_dev, mx_test, best_test, argmx_dev))
    return mx_train, mx_dev, mx_test, best_test, argmx_dev


class WeightedWordsList :
    """Word -> weight table loaded from a text file of "word:weight" lines."""

    @staticmethod
    def get_key(wtuple):
        # Sort key for a (word, score) pair.
        return wtuple[1]

    @staticmethod
    def get_okey(wtuple):
        # Sort key for an (index, (word, score)) pair from enumerate().
        return wtuple[1][1]

    def __init__(self, file_path):
        # FIX: context manager closes the handle; the original
        # codecs.open(...).readlines() leaked the file descriptor.
        with codecs.open(file_path, "r", "utf8") as fh:
            raw = fh.readlines()
        self.wlist = [x.strip().split(':') for x in raw]
        self.wlist = [(x, float(y)) for x, y in self.wlist]
        self.wdict = {}
        for x, y in self.wlist:
            # Keys are stored UTF-8 encoded, matching select_best's lookups.
            self.wdict[x.encode("utf8")] = y

    def select_best(self, word_list, lenght=5):
        """Return (up to) the `lenght` highest-weighted known words.

        Unknown words are skipped. NOTE: "lenght" is a typo for "length"
        but is kept so keyword callers keep working. Words tying with the
        current minimum are appended too, so the result may exceed
        `lenght` — this mirrors the original behaviour exactly.
        """
        scored_word = []
        for w in word_list:
            w = w.encode("utf8")
            if w not in self.wdict:
                continue
            if len(scored_word) < lenght:
                scored_word.append((w, self.wdict[w]))
            else:
                w_min = min(enumerate(scored_word), key=WeightedWordsList.get_okey)
                w_curr = (w, self.wdict[w])
                if w_min[1][1] < w_curr[1]:
                    # Replace the current minimum with the better word.
                    del scored_word[w_min[0]]
                    scored_word.append(w_curr)
                    w_min = min(enumerate(scored_word), key=WeightedWordsList.get_okey)
                    # Drain any tie-induced overflow while the minimum is
                    # still dominated by the new word.
                    while len(scored_word) > lenght and w_min[1][1] < w_curr[1]:
                        del scored_word[w_min[0]]
                        w_min = min(enumerate(scored_word), key=WeightedWordsList.get_okey)
                elif w_min[1][1] == w_curr[1]:
                    # Tie with the minimum: keep both (list may grow).
                    scored_word.append(w_curr)
        return [w[0] for w in scored_word]