Blame view
LDA/utils.py
3.43 KB
b6d0165d1 Initial commit |
1 2 3 |
# -*- coding: utf-8 -*- import nltk import re |
2af8e57f4 change all |
4 5 6 |
import codecs import numpy as np import sqlite3 |
b6d0165d1 Initial commit |
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 |
# Tokenization pattern: numbers (optionally decimal, optionally "%"),
# 1-2 letter elisions followed by an apostrophe ("l'", "qu'"...), the
# literal "<unk>" token, accented French words, or any single
# non-word/non-space character.
# NOTE: u"..." with escaped backslashes replaces the py2-only ur"..."
# literals so the module parses under both Python 2 and 3.
pattern = u"\\d+(?:\\.\\d+)?\\s*%?|\\w{1,2}'|<unk>|[\\wéàèùêôûâòìîç]+|[^\\w\\s]"

# Normalisation patterns gluing back French transit vocabulary spelled
# out letter by letter ("r e r" -> "rer", "s n c f" -> "sncf", ...).
rer_b = re.compile(u" r e r(?: e r)? b ")
rer_c = re.compile(u" r e r(?: e r)? c |r e r( e r)? c' est | rer c' est")
# FIX: the original alternation listed " r e rer " twice; the duplicate
# alternative is removed (matching behaviour is unchanged).
rer = re.compile(u" (e )?r e r(?: e r)? |re r( e r)? |rer e r | r e rer | r e r |r( e r)+ ")
sncf = re.compile(u" s n c f ")
jusq = re.compile(u" jusqu ' ")
ratp = re.compile(u" r a t(?: p)? ")
quel = re.compile(u" quelqu ' ")
space = re.compile(u" +")

tok2 = nltk.RegexpTokenizer(pattern, flags=re.UNICODE)


def preproc(line):
    """Normalise one utterance: collapse runs of spaces, rejoin the
    letter-spelled acronyms/elisions above, and lowercase the result."""
    line = space.sub(u" ", line)
    line = rer_b.sub(u" rer b ", line)
    line = rer_c.sub(u" rer c ", line)
    line = rer.sub(u" rer ", line)
    line = sncf.sub(u" sncf ", line)
    line = ratp.sub(u" ratp ", line)
    line = jusq.sub(u" jusqu' ", line)
    line = quel.sub(u" quelqu' ", line)
    # Collapse again: the substitutions above may introduce double spaces.
    line = space.sub(u" ", line)
    return line.lower()


def yield_corpus(df_list):
    """Yield the token list of column 2 of every row of every dataframe
    in df_list; the column is assumed to hold UTF-8 encoded bytes
    (TODO confirm against the callers that build these dataframes)."""
    for corpus in df_list:
        for doc_id, doc in corpus.iterrows():  # renamed: "id" shadowed a builtin
            try:
                yield tok2.tokenize(preproc(doc[2].decode("utf-8")))
            except Exception:
                # Show the offending document, then propagate.
                print(doc[2])
                raise


def select(elm):
    """Return the integer after the last '_' in elm (e.g. "run_12" -> 12)."""
    return int(elm.split("_")[-1])
7db73861f add vae et mmf |
45 46 47 48 |
def select_mmf(elm):
    """Return the integer prefix of elm (everything before the first '_')."""
    prefix, _sep, _rest = elm.partition("_")
    return int(prefix)
2af8e57f4 change all |
49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 |
def get_score(table):
    """Report and return score maxima from a 3-row per-epoch score table.

    table[0]/table[1]/table[2] hold train/dev/test scores per epoch.
    Returns (max_train, max_dev, max_test, test_at_best_dev, best_epoch)
    where best_epoch is the epoch with the highest dev score.
    """
    mx_train = np.max(table[0])
    argmx_dev = np.argmax(table[1])
    mx_dev = table[1][argmx_dev]
    # Test score at the dev-selected epoch (the honest number to report),
    # alongside the overall test maximum.
    best_test = table[2][argmx_dev]
    mx_test = np.max(table[2])
    # print(...) with a single argument is valid Python 2 and 3.
    print("""\tmax train : {} \tmax dev : {} \tmax test : {} - best test : {} \t best epochs : {}""".format(mx_train, mx_dev, mx_test, best_test, argmx_dev))
    return mx_train, mx_dev, mx_test, best_test, argmx_dev


class WeightedWordsList :
    """Word -> weight table loaded from a text file of "word:weight" lines."""

    @staticmethod
    def get_key(wtuple):
        # Sort key for a (word, score) pair.
        return wtuple[1]

    @staticmethod
    def get_okey(wtuple):
        # Sort key for an (index, (word, score)) pair from enumerate().
        return wtuple[1][1]

    def __init__(self, file_path):
        # FIX: context manager closes the handle; the original
        # codecs.open(...).readlines() leaked the file descriptor.
        with codecs.open(file_path, "r", "utf8") as fh:
            raw = fh.readlines()
        self.wlist = [x.strip().split(':') for x in raw]
        self.wlist = [(x, float(y)) for x, y in self.wlist]
        self.wdict = {}
        for x, y in self.wlist:
            # Keys are stored UTF-8 encoded, matching select_best's lookups.
            self.wdict[x.encode("utf8")] = y

    def select_best(self, word_list, lenght=5):
        """Return (up to) the `lenght` highest-weighted known words.

        Unknown words are skipped. NOTE: "lenght" is a typo for "length"
        but is kept so keyword callers keep working. Words tying with the
        current minimum are appended too, so the result may exceed
        `lenght` — this mirrors the original behaviour exactly.
        """
        scored_word = []
        for w in word_list:
            w = w.encode("utf8")
            if w not in self.wdict:
                continue
            if len(scored_word) < lenght:
                scored_word.append((w, self.wdict[w]))
            else:
                w_min = min(enumerate(scored_word), key=WeightedWordsList.get_okey)
                w_curr = (w, self.wdict[w])
                if w_min[1][1] < w_curr[1]:
                    # Replace the current minimum with the better word.
                    del scored_word[w_min[0]]
                    scored_word.append(w_curr)
                    w_min = min(enumerate(scored_word), key=WeightedWordsList.get_okey)
                    # Drain any tie-induced overflow while the minimum is
                    # still dominated by the new word.
                    while len(scored_word) > lenght and w_min[1][1] < w_curr[1]:
                        del scored_word[w_min[0]]
                        w_min = min(enumerate(scored_word), key=WeightedWordsList.get_okey)
                elif w_min[1][1] == w_curr[1]:
                    # Tie with the minimum: keep both (list may grow).
                    scored_word.append(w_curr)
        return [w[0] for w in scored_word]