Commit 7db73861ffbab3f3f51b17188d8894a512b36264

Authored by Killian
1 parent b6d0165d16
Exists in master

add vae and mmf

Showing 13 changed files with 1084 additions and 44 deletions

LDA/00-mmf_make_features.py
File was created 1 import sys
2 import os
3
4 import pandas
5 import numpy
6 import shelve
7
8 from sklearn.preprocessing import LabelBinarizer
9
10 from utils import select_mmf as select
11
12 input_dir = sys.argv[1] # Top-level directory containing ASR and TRS
13 level = sys.argv[2] # desired LDA size ( -5)
14
15 lb=LabelBinarizer()
16 #y_train=lb.fit_transform([utils.select(ligneid) for ligneid in origin_corps["LABEL"]["TRAIN"]])
17
18
19 data = shelve.open("{}/mmf_{}.shelve".format(input_dir,level), writeback=True)
20 data["LABEL"], data["LDA"] = {}, {}
21 for mod in ["ASR", "TRS"]:
22 train = pandas.read_table("{}/{}/train_{}.ssv".format(input_dir, mod, level), sep=" ", header=None )
23 dev = pandas.read_table("{}/{}/dev_{}.ssv".format(input_dir, mod, level), sep=" ", header=None )
24 test = pandas.read_table("{}/{}/test_{}.ssv".format(input_dir, mod, level), sep=" ", header=None )
25
26 y_train = train.iloc[:,0].apply(select)
27 y_dev = dev.iloc[:,0].apply(select)
28 y_test = test.iloc[:,0].apply(select)
29 lb.fit(y_train)
30 data["LABEL"][mod]={"TRAIN":lb.transform(y_train),"DEV":lb.transform(y_dev), "TEST": lb.transform(y_test)}
31
32 data["LDA"][mod]={}
33 data["LDA"][mod]["TRAIN"]=train.iloc[:,1:].values
34 data["LDA"][mod]["DEV"]=dev.iloc[:,1:].values
35 data["LDA"][mod]["TEST"]=test.iloc[:,1:].values
36
37 data.sync()
38 data.close()
39
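A quick way to sanity-check the shelve this script writes, assuming it was run as `python 00-mmf_make_features.py <input_dir> <level>`; the path below is a hypothetical example of the `{input_dir}/mmf_{level}.shelve` name built above:

import shelve

# hypothetical path following the "{}/mmf_{}.shelve" pattern used above
data = shelve.open("output_v5/mmf_5.shelve")
for mod in ["ASR", "TRS"]:
    # LDA feature matrices and binarized labels written by 00-mmf_make_features.py
    print mod, data["LDA"][mod]["TRAIN"].shape, data["LABEL"][mod]["TRAIN"].shape
data.close()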
1 import gensim 1 import gensim
2 import os 2 import os
3 import sys 3 import sys
4 import pickle 4 import pickle
5 from gensim.models.ldamodel import LdaModel 5 from gensim.models.ldamodel import LdaModel
6 from gensim.models.ldamulticore import LdaMulticore 6 from gensim.models.ldamulticore import LdaMulticore
7 from collections import Counter 7 from collections import Counter
8 import numpy as np 8 import numpy as np
9 import codecs 9 import codecs
10 import shelve 10 import shelve
11 import logging 11 import logging
12 import dill 12 import dill
13 from tinydb import TinyDB, where, Query 13 from tinydb import TinyDB, where, Query
14 import time 14 import time
15 from joblib import Parallel, delayed
15 16
16 def calc_perp(models,train): 17 def calc_perp(models,train):
17 18
18 19
19 stop_words=models[1] 20 stop_words=models[1]
20 name = models[0] 21 name = models[0]
21 22
22 logging.warning(" go {} ".format(name)) 23 logging.warning(" go {} ".format(name))
23 logging.warning("TRS to be done") 24 logging.warning("TRS to be done")
24 entry = Query() 25 entry = Query()
25 value=db.search(entry.name == name) 26 value=db.search(entry.name == name)
26 if len(value) > 0 : 27 if len(value) > 0 :
27 logging.warning("{} already done".format(name)) 28 logging.warning("{} already done".format(name))
28 return 29 return
29 30
30 dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]] 31 dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]]
31 lda_trs = models[2] 32 lda_trs = models[2]
32 perp_trs = lda_trs.log_perplexity(dev_trs) 33 perp_trs = lda_trs.log_perplexity(dev_trs)
33 34
34 logging.warning("ASR to be done") 35 logging.warning("ASR to be done")
35 dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]] 36 dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]]
36 lda_asr = models[5] 37 lda_asr = models[5]
37 perp_asr = lda_asr.log_perplexity(dev_asr) 38 perp_asr = lda_asr.log_perplexity(dev_asr)
38 logging.warning("ASR saving") 39 logging.warning("ASR saving")
39 res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs } 40 res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs }
40 return res_dict 41 return res_dict
41 42
42 43
43 44
44 45
45 def train_lda(out_dir,train,size,it,sw_size,alpha,eta,passes,chunk): 46 def train_lda(out_dir,train,size,it,sw_size,alpha,eta,passes,chunk):
46 name = "s{}_it{}_sw{}_a{}_e{}_p{}_c{}".format(size,it,sw_size,alpha,eta,passes,chunk) 47 name = "s{}_it{}_sw{}_a{}_e{}_p{}_c{}".format(size,it,sw_size,alpha,eta,passes,chunk)
47 logging.warning(name) 48 logging.warning(name)
48 if os.path.isfile(out_dir+"/"+name+".dill"): 49 deep_out_dir = out_dir+"/"+name
50 if os.path.isdir(deep_out_dir):
49 logging.error(name+" already done") 51 logging.error(name+" already done")
50 return 52 return
51 logging.warning(name+" to be done") 53 logging.warning(name+" to be done")
52 asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y]) 54 asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y])
53 trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y]) 55 trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y])
54 asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ] 56 asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ]
55 trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ] 57 trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ]
56 stop_words=set(asr_sw) | set(trs_sw) 58 stop_words=set(asr_sw) | set(trs_sw)
57 stop_words=[ x.strip() for x in open("french.txt").readlines() ]
58 59
59 logging.warning("TRS to be done") 60 logging.warning("TRS to be done")
60 61
61 lda_trs = LdaModel(corpus=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["TRAIN"]], id2word=train["vocab"], num_topics=int(size), chunksize=chunk,iterations=it,alpha=alpha,eta=eta,passes=passes) 62 lda_trs = LdaModel(corpus=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["TRAIN"]], id2word=train["vocab"], num_topics=int(size), chunksize=chunk,iterations=it,alpha=alpha,eta=eta,passes=passes)
62 63
63 logging.warning("ASR to be done") 64 logging.warning("ASR to be done")
64 lda_asr = LdaModel(corpus=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["TRAIN"]], id2word=train["vocab"], num_topics=int(size), chunksize=chunk,iterations=it,alpha=alpha,eta=eta,passes=passes) 65 lda_asr = LdaModel(corpus=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["TRAIN"]], id2word=train["vocab"], num_topics=int(size), chunksize=chunk,iterations=it,alpha=alpha,eta=eta,passes=passes)
65 66
66 dico = train["vocab"] 67 dico = train["vocab"]
67 word_list = [ dico[x] for x in range(len(train["vocab"]))] 68 word_list = [ dico[x] for x in range(len(train["vocab"]))]
68 asr_probs = [] 69 asr_probs = []
69 for line in lda_asr.expElogbeta: 70 for line in lda_asr.expElogbeta:
70 nline = line / np.sum(line) 71 nline = line / np.sum(line)
71 asr_probs.append( str(x) for x in nline) 72 asr_probs.append([ str(x) for x in nline])
72 trs_probs = [] 73 trs_probs = []
73 for line in lda_trs.expElogbeta: 74 for line in lda_trs.expElogbeta:
74 nline = line / np.sum(line) 75 nline = line / np.sum(line)
75 trs_probs.append( str(x) for x in nline) 76 trs_probs.append([str(x) for x in nline])
76 77
77 K = lda_asr.num_topics 78 K = lda_asr.num_topics
78 topicWordProbMat_asr = lda_asr.print_topics(K,10) 79 topicWordProbMat_asr = lda_asr.print_topics(K,10)
79 80
80 K = lda_trs.num_topics 81 K = lda_trs.num_topics
81 topicWordProbMat_trs = lda_trs.print_topics(K,10) 82 topicWordProbMat_trs = lda_trs.print_topics(K,10)
83 os.mkdir(deep_out_dir)
84 dill.dump([x for x in stop_words],open(deep_out_dir+"/stopwords.dill","w"))
85 lda_asr.save(deep_out_dir+"/lda_asr.model")
86 lda_trs.save(deep_out_dir+"/lda_trs.model")
87 dill.dump([x for x in asr_probs],open(deep_out_dir+"/lda_asr_probs.dill","w"))
88 dill.dump([x for x in trs_probs],open(deep_out_dir+"/lda_trs_probs.dill","w"))
89
82 return [name, stop_words, lda_asr , asr_probs , topicWordProbMat_asr, lda_trs, trs_probs, topicWordProbMat_trs] 90 return [name, stop_words, lda_asr , asr_probs , topicWordProbMat_asr, lda_trs, trs_probs, topicWordProbMat_trs]
83 91
92 def train_one(name,train,s,i,sw,a,e,p,c):
93 st=time.time()
94 logging.warning(" ; ".join([str(x) for x in [s,i,sw,a,e,p,c]]))
95 models = train_lda(name,train,s,i,sw,a,e,p,c)
96 if models:
97 m = calc_perp(models,train)
98 #dill.dump(models,open("{}/{}.dill".format(name,models[0]),"wb"))
99 else :
100 m = None
101 e = time.time()
102 logging.warning("done in : {}".format(e-st))
103 return m
104
105
106
107
84 if __name__ == "__main__": 108 if __name__ == "__main__":
85 logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING) 109 logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING)
86 110
87 input_shelve = sys.argv[1] 111 input_shelve = sys.argv[1]
88 db_path = sys.argv[2] 112 db_path = sys.argv[2]
89 size = [ int(x) for x in sys.argv[3].split("_")] 113 size = [ int(x) for x in sys.argv[3].split("_")]
90 workers = int(sys.argv[4]) 114 workers = int(sys.argv[4])
91 name = sys.argv[5] 115 name = sys.argv[5]
92 it = [ int(x) for x in sys.argv[6].split("_")] 116 it = [ int(x) for x in sys.argv[6].split("_")]
93 sw_size = [ int(x) for x in sys.argv[7].split("_")] 117 sw_size = [ int(x) for x in sys.argv[7].split("_")]
94 if sys.argv[8] != "None" : 118 if sys.argv[8] != "None" :
95 alpha = [ "symmetric", "auto" ] + [ float(x) for x in sys.argv[8].split("_")] 119 alpha = [ "symmetric", "auto" ] + [ float(x) for x in sys.argv[8].split("_")]
96 eta = ["auto"] + [ float(x) for x in sys.argv[9].split("_")] 120 eta = ["auto"] + [ float(x) for x in sys.argv[9].split("_")]
97 else : 121 else :
98 alpha = ["symmetric"] 122 alpha = ["symmetric"]
99 eta = ["auto"] 123 eta = ["auto"]
100 passes = [ int(x) for x in sys.argv[10].split("_")] 124 passes = [ int(x) for x in sys.argv[10].split("_")]
101 chunk = [ int(x) for x in sys.argv[11].split("_")] 125 chunk = [ int(x) for x in sys.argv[11].split("_")]
102 126
103 #train=pickle.load(open("{}/newsgroup_bow_train.pk".format(input_dir))) 127 #train=pickle.load(open("{}/newsgroup_bow_train.pk".format(input_dir)))
104 train = shelve.open(input_shelve) 128 train = shelve.open(input_shelve)
105 try : 129 try :
106 os.mkdir(name) 130 os.mkdir(name)
107 except : 131 except :
108 logging.warning(" folder already existe " ) 132 logging.warning(" folder already existe " )
109 db = TinyDB(db_path) 133 db = TinyDB(db_path)
110 nb_model = len(passes) * len(chunk) * len(it) * len(sw_size) * len(alpha) * len(eta) * len(size) 134 nb_model = len(passes) * len(chunk) * len(it) * len(sw_size) * len(alpha) * len(eta) * len(size)
111 logging.warning(" hey will train {} models ".format(nb_model)) 135 logging.warning(" hey will train {} models ".format(nb_model))
136
137 args_list=[]
112 for p in passes: 138 for p in passes:
113 for c in chunk: 139 for c in chunk:
114 for i in it : 140 for i in it :
115 for sw in sw_size: 141 for sw in sw_size:
116 for a in alpha: 142 for a in alpha:
117 for e in eta: 143 for e in eta:
118 for s in size: 144 for s in size:
119 st=time.time() 145 args_list.append((name,train,s,i,sw,a,e,p,c))
120 logging.warning(" ; ".join([str(x) for x in [s,i,sw,a,e,p,c]])) 146 res_list= Parallel(n_jobs=15)(delayed(train_one)(*args) for args in args_list)
121 models = train_lda(name,train,s,i,sw,a,e,p,c) 147 for m in res_list :
122 if models: 148 db.insert(m)
123 m = calc_perp(models,train) 149
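The new __main__ block above collects every hyperparameter combination into args_list and hands it to joblib. A minimal, self-contained sketch of that Parallel/delayed pattern with a toy stand-in for train_one; the function body, grid values and n_jobs below are illustrative, not from the commit:

from joblib import Parallel, delayed

def train_one_toy(size, alpha):
    # stand-in for train_one(name, train, s, i, sw, a, e, p, c)
    return {"name": "s{}_a{}".format(size, alpha), "score": size * alpha}

args_list = [(s, a) for s in [50, 100] for a in [0.01, 0.1]]
res_list = Parallel(n_jobs=2)(delayed(train_one_toy)(*args) for args in args_list)
print res_list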
LDA/03-mono_perplex.py
1 import gensim 1 import gensim
2 import time 2 import time
3 import os 3 import os
4 import sys 4 import sys
5 import pickle 5 import pickle
6 from gensim.models.ldamodel import LdaModel 6 from gensim.models.ldamodel import LdaModel
7 from gensim.models.ldamulticore import LdaMulticore 7 from gensim.models.ldamulticore import LdaMulticore
8 from collections import Counter 8 from collections import Counter
9 import numpy as np 9 import numpy as np
10 import codecs 10 import codecs
11 import shelve 11 import shelve
12 import logging 12 import logging
13 import glob 13 import glob
14 from tinydb import TinyDB, where, Query 14 from tinydb import TinyDB, where, Query
15 15
16 16
17 def calc_perp(in_dir,train): 17 def calc_perp(in_dir,train):
18 name = in_dir.split("/")[-1] 18 name = in_dir.split("/")[-1]
19 # s40_it1_sw50_a0.01_e0.1_p6_c1000 19 # s40_it1_sw50_a0.01_e0.1_p6_c1000
20 sw_size = int(name.split("_")[2][2:]) 20 sw_size = int(name.split("_")[2][2:])
21 21
22 logging.warning(" go {} ".format(name)) 22 logging.warning(" go {} ".format(name))
23 23
24 24
25 logging.warning("Redo Vocab and stop") 25 logging.warning("Redo Vocab and stop")
26 asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y]) 26 asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y])
27 trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y]) 27 trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y])
28 asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ] 28 asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ]
29 trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ] 29 trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ]
30 stop_words=set(asr_sw) | set(trs_sw) 30 stop_words=set(asr_sw) | set(trs_sw)
31 31
32 logging.warning("TRS to be done") 32 logging.warning("TRS to be done")
33 entry = Query() 33 entry = Query()
34 value=db.search(entry.name == name) 34 value=db.search(entry.name == name)
35 if len(value) > 0 : 35 if len(value) > 0 :
36 logging.warning("{} already done".format(name)) 36 logging.warning("{} already done".format(name))
37 return 37 return
38 38
39 dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]] 39 dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]]
40 lda_trs = LdaModel.load("{}/lda_trs.model".format(in_dir)) 40 lda_trs = LdaModel.load("{}/lda_trs.model".format(in_dir))
41 perp_trs = lda_trs.log_perplexity(dev_trs) 41 perp_trs = lda_trs.log_perplexity(dev_trs)
42 logging.warning("ASR to be done") 42 logging.warning("ASR to be done")
43 dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]] 43 dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]]
44 lda_asr = LdaModel.load("{}/lda_asr.model".format(in_dir)) 44 lda_asr = LdaModel.load("{}/lda_asr.model".format(in_dir))
45 perp_asr = lda_asr.log_perplexity(dev_asr) 45 perp_asr = lda_asr.log_perplexity(dev_asr)
46 logging.warning("ASR saving") 46 logging.warning("ASR saving")
47 res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs} 47 res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs}
48 return res_dict 48 return res_dict
49 49
50 if __name__ == "__main__": 50 if __name__ == "__main__":
51 input_shelve = sys.argv[1] 51 input_shelve = sys.argv[1]
52 input_dir = sys.argv[2] 52 input_dir = sys.argv[2]
53 db_path = sys.argv[3] 53 db_path = sys.argv[3]
54 logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING) 54 logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING)
55 folders = glob.glob("{}/*".format(input_dir)) 55 folders = glob.glob("{}/s*".format(input_dir))
56 56
57 #train=pickle.load(open("{}/newsgroup_bow_train.pk".format(input_dir))) 57 #train=pickle.load(open("{}/newsgroup_bow_train.pk".format(input_dir)))
58 train = shelve.open(input_shelve) 58 train = shelve.open(input_shelve)
59 db = TinyDB(db_path) 59 db = TinyDB(db_path)
60 for indx, folder in enumerate(folders) : 60 for indx, folder in enumerate(folders) :
61 s = time.time() 61 s = time.time()
62 r=calc_perp(folder,train) 62 r=calc_perp(folder,train)
63 if r : 63 if r :
64 db.insert(r) 64 db.insert(r)
65 e = time.time() 65 e = time.time()
66 print "FIN : {} {} : {}".format(folder,indx,e-s) 66 print "FIN : {} {} : {}".format(folder,indx,e-s)
67 67
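calc_perp above recovers the stop-word list size from the model folder name (e.g. s40_it1_sw50_a0.01_e0.1_p6_c1000, as in the comment). A hedged sketch of a small parser for that naming scheme, mirroring the format string used in train_lda; this helper is not part of the commit:

def parse_name(name):
    # name format: "s{size}_it{it}_sw{sw}_a{alpha}_e{eta}_p{passes}_c{chunk}"
    size, it, sw, alpha, eta, passes, chunk = name.split("_")
    return {"size": int(size[1:]), "it": int(it[2:]), "sw": int(sw[2:]),
            "alpha": alpha[1:], "eta": eta[1:],
            "passes": int(passes[1:]), "chunk": int(chunk[1:])}

print parse_name("s40_it1_sw50_a0.01_e0.1_p6_c1000")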
1 import gensim 1 import gensim
2 import time 2 import time
3 import os 3 import os
4 import sys 4 import sys
5 import pickle 5 import pickle
6 from gensim.models.ldamodel import LdaModel 6 from gensim.models.ldamodel import LdaModel
7 from gensim.models.ldamulticore import LdaMulticore 7 from gensim.models.ldamulticore import LdaMulticore
8 from collections import Counter 8 from collections import Counter
9 import numpy as np 9 import numpy as np
10 import codecs 10 import codecs
11 import shelve 11 import shelve
12 import logging 12 import logging
13 import glob 13 import glob
14 from tinydb import TinyDB, where, Query 14 from tinydb import TinyDB, where, Query
15 from itertools import izip_longest, repeat 15 from itertools import izip_longest, repeat
16 from multiprocessing import Pool 16 from multiprocessing import Pool
17 17
18 def grouper(n, iterable, fillvalue=None): 18 def grouper(n, iterable, fillvalue=None):
19 "grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx" 19 "grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx"
20 args = [iter(iterable)] * n 20 args = [iter(iterable)] * n
21 return izip_longest(fillvalue=fillvalue, *args) 21 return izip_longest(fillvalue=fillvalue, *args)
22 22
23 23
24 def calc_perp(params): 24 def calc_perp(params):
25 in_dir,train = params 25 try:
26 name = in_dir.split("/")[-1] 26 in_dir,train = params
27 # s40_it1_sw50_a0.01_e0.1_p6_c1000 27 name = in_dir.split("/")[-1]
28 # s40_it1_sw50_a0.01_e0.1_p6_c1000
28 29
29 entry = Query() 30 entry = Query()
30 value=db.search(entry.name == name) 31 value=db.search(entry.name == name)
31 if len(value) > 0 : 32 if len(value) > 0 :
32 logging.warning("{} already done".format(name)) 33 logging.warning("{} already done".format(name))
33 return 34 return
34 35
35 sw_size = int(name.split("_")[2][2:]) 36 sw_size = int(name.split("_")[2][2:])
36 37
37 logging.warning(" go {} ".format(name)) 38 logging.warning(" go {} ".format(name))
38 39
39 40
40 logging.warning("Redo Vocab and stop") 41 logging.warning("Redo Vocab and stop")
41 asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y]) 42 asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y])
42 trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y]) 43 trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y])
43 asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ] 44 asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ]
44 trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ] 45 trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ]
45 stop_words=set(asr_sw) | set(trs_sw) 46 stop_words=set(asr_sw) | set(trs_sw)
46 47
47 logging.warning("TRS to be done") 48 logging.warning("TRS to be done")
48 49
49 dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]] 50 dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]]
50 lda_trs = LdaModel.load("{}/lda_trs.model".format(in_dir)) 51 lda_trs = LdaModel.load("{}/lda_trs.model".format(in_dir))
51 perp_trs = lda_trs.log_perplexity(dev_trs) 52 perp_trs = lda_trs.log_perplexity(dev_trs)
52 logging.warning("ASR to be done") 53 logging.warning("ASR to be done")
53 dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]] 54 dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]]
54 lda_asr = LdaModel.load("{}/lda_asr.model".format(in_dir)) 55 lda_asr = LdaModel.load("{}/lda_asr.model".format(in_dir))
55 perp_asr = lda_asr.log_perplexity(dev_asr) 56 perp_asr = lda_asr.log_perplexity(dev_asr)
56 logging.warning("ASR saving") 57 logging.warning("ASR saving")
57 res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs} 58 res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs}
58 return res_dict 59 return res_dict
60 except :
61 return { "name" : name }
59 62
60 if __name__ == "__main__": 63 if __name__ == "__main__":
61 input_shelve = sys.argv[1] 64 input_shelve = sys.argv[1]
62 input_dir = sys.argv[2] 65 input_dir = sys.argv[2]
63 db_path = sys.argv[3] 66 db_path = sys.argv[3]
64 logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING) 67 logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING)
65 folders = glob.glob("{}/*".format(input_dir)) 68 folders = glob.glob("{}/*".format(input_dir))
66 69
67 #train=pickle.load(open("{}/newsgroup_bow_train.pk".format(input_dir))) 70 #train=pickle.load(open("{}/newsgroup_bow_train.pk".format(input_dir)))
68 train = dict(shelve.open(input_shelve)) 71 train = dict(shelve.open(input_shelve))
69 db = TinyDB(db_path) 72 db = TinyDB(db_path)
70 names = [ x["name"] for x in db.all()] 73 names = [ x["name"] for x in db.all()]
71 p = Pool(processes=14,maxtasksperchild=10) 74 p = Pool(processes=14,maxtasksperchild=10)
72 75
73 s = time.time() 76 s = time.time()
74 perplexs = p.map(calc_perp,zip(folders,repeat(train,len(folders)))) 77 perplexs = p.map(calc_perp,zip(folders,repeat(train,len(folders))))
75 78
76 for indx, perp in enumerate(perplexs) : 79 for indx, perp in enumerate(perplexs) :
77 if perp : 80 if perp :
78 db.insert(perp) 81 db.insert(perp)
79 e = time.time() 82 e = time.time()
80 print "FIN : {} : {}".format(indx,e-s) 83 print "FIN : {} : {}".format(indx,e-s)
81 84
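This variant fans the per-folder perplexity computation out over a multiprocessing Pool, pairing every folder with a repeated reference to the shared train dict. A minimal, self-contained sketch of that Pool.map pattern with a toy worker; the folder name and data below are made up:

from multiprocessing import Pool
from itertools import repeat

def worker(params):
    # toy stand-in for calc_perp((in_dir, train))
    folder, shared = params
    return {"name": folder, "n_docs": len(shared["TRS_wid"]["DEV"])}

if __name__ == "__main__":
    folders = ["runs/s40_it1_sw50_a0.01_e0.1_p6_c1000"]   # hypothetical model folder
    shared = {"TRS_wid": {"DEV": [[1, 2, 3], [4, 5]]}}    # stand-in for the shelve contents
    p = Pool(processes=2)
    print p.map(worker, zip(folders, repeat(shared, len(folders))))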
File was created 1
2 # coding: utf-8
3
4 # In[29]:
5
6 # Import
7 import itertools
8 import shelve
9 import pickle
10 import numpy
11 import scipy
12 from scipy import sparse
13 import scipy.sparse
14 import scipy.io
15 from mlp import *
16 import mlp
17 import sys
18 import utils
19 import dill
20 from collections import Counter
21 from gensim.models import LdaModel
22
23
24
25 # In[3]:
26
27 #30_50_50_150_0.0001
28
29 # In[4]:
30
31 #db=shelve.open("SPELIKE_MLP_DB.shelve",writeback=True)
32 origin_corps=shelve.open("{}".format(sys.argv[2]))
33 in_dir = sys.argv[1]
34
35
36 out_db=shelve.open("{}/mlp_scores.shelve".format(in_dir),writeback=True)
37
38 mlp_h = [ 250, 250 ]
39 mlp_loss = "categorical_crossentropy"
40 mlp_dropouts = [0.25]* len(mlp_h)
41 mlp_sgd = Adam(lr=0.0001)
42 mlp_epochs = 3000
43 mlp_batch_size = 1
44 mlp_input_activation = "relu"
45 mlp_output_activation="softmax"
46
47 ress = []
48 for key in ["TRS", "ASR"] :
49
50 res=mlp.train_mlp(origin_corps["LDA"][key]["TRAIN"],origin_corps["LABEL"][key]["TRAIN"],
51 origin_corps["LDA"][key]["DEV"],origin_corps["LABEL"][key]["DEV"],
52 origin_corps["LDA"][key]["TEST"],origin_corps["LABEL"][key]["TEST"],
53 mlp_h,dropouts=mlp_dropouts,sgd=mlp_sgd,
54 epochs=mlp_epochs,
55 batch_size=mlp_batch_size,
56 save_pred=False,keep_histo=False,
57 loss="categorical_crossentropy",fit_verbose=0)
58 arg_best=[]
59 dev_best=[]
60 arg_best.append(numpy.argmax(res[1]))
61 dev_best.append(res[1][arg_best[-1]])
62 res[1][arg_best[-1]]=0
63 arg_best.append(numpy.argmax(res[1]))
64 dev_best.append(res[1][arg_best[-1]])
65 res[1][arg_best[-1]]=0
66 arg_best.append(numpy.argmax(res[1]))
67 dev_best.append(res[1][arg_best[-1]])
68 res[1][arg_best[-1]]=0
69 arg_best.append(numpy.argmax(res[1]))
70 dev_best.append(res[1][arg_best[-1]])
71 res[1][arg_best[-1]]=0
72 arg_best.append(numpy.argmax(res[1]))
73 dev_best.append(res[1][arg_best[-1]])
74 res[1][arg_best[-1]]=0
75 arg_best.append(numpy.argmax(res[1]))
76 dev_best.append(res[1][arg_best[-1]])
77 res[1][arg_best[-1]]=0
78 arg_best.append(numpy.argmax(res[1]))
79 dev_best.append(res[1][arg_best[-1]])
80 res[1][arg_best[-1]]=0
81 arg_best.append(numpy.argmax(res[1]))
82 dev_best.append(res[1][arg_best[-1]])
83 res[1][arg_best[-1]]=0
84 arg_best.append(numpy.argmax(res[1]))
85 dev_best.append(res[1][arg_best[-1]])
86 res[1][arg_best[-1]]=0
87 arg_best.append(numpy.argmax(res[1]))
88 dev_best.append(res[1][arg_best[-1]])
89 res[1][arg_best[-1]]=0
90 arg_best.append(numpy.argmax(res[1]))
91 dev_best.append(res[1][arg_best[-1]])
92 res[1][arg_best[-1]]=0
93 arg_best.append(numpy.argmax(res[1]))
94 dev_best.append(res[1][arg_best[-1]])
95 res[1][arg_best[-1]]=0
96
97
98
99
100 test_best =[ res[2][x] for x in arg_best ]
101 test_max = numpy.max(res[2])
102 out_db[key]=(res,(dev_best,test_best,test_max))
103 ress.append((key,dev_best,test_best,test_max))
104
105 for el in ress :
106 print el
107 out_db.close()
108 origin_corps.close()
109
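The long block of repeated numpy.argmax calls above keeps the twelve best dev epochs (zeroing each one out before looking for the next) together with the test scores at those epochs. An equivalent, more compact sketch using numpy.argsort, under the assumption that res[1] holds per-epoch dev scores and res[2] the matching test scores; unlike the original it does not mutate the dev list in place:

import numpy

def best_k(dev_scores, test_scores, k=12):
    # indices of the k highest dev scores, best first
    arg_best = numpy.argsort(dev_scores)[::-1][:k]
    dev_best = [dev_scores[i] for i in arg_best]
    test_best = [test_scores[i] for i in arg_best]
    return list(arg_best), dev_best, test_best, numpy.max(test_scores)

# toy per-epoch scores standing in for res[1] (dev) and res[2] (test)
print best_k([0.1, 0.5, 0.3, 0.4], [0.2, 0.6, 0.1, 0.7], k=2)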
LDA/04b-mmf_mini_ae.py
File was created 1
2 # coding: utf-8
3
4 # In[2]:
5
6 # Import
7 import gensim
8 from scipy import sparse
9 import itertools
10 from sklearn import preprocessing
11 from keras.models import Sequential
12 from keras.optimizers import SGD,Adam
13 from mlp import *
14 import sklearn.metrics
15 import shelve
16 import pickle
17 from utils import *
18 import sys
19 import os
20 import json
21 # In[4]:
22
23 infer_model=shelve.open("{}".format(sys.argv[2]))
24 in_dir = sys.argv[1]
25 #['ASR', 'TRS', 'LABEL']
26 # In[6]:
27
28
29 hidden_size=[ 100 , 50, 100 ]
30 input_activation="tanh"
31 output_activation="tanh"
32 loss="mse"
33 epochs=1000
34 batch=1
35 patience=60
36 do_do=[False]
37 sgd = Adam(lr=0.000001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True)
38
39
40
41 mlp_h = [ 150 ,150 ,150 ]
42 mlp_loss = "categorical_crossentropy"
43 mlp_dropouts = []
44 mlp_sgd = Adam(lr=0.0001)
45 mlp_epochs = 2000
46 mlp_batch_size = 8
47 mlp_output_activation="softmax"
48
49 try :
50 sgd_repr=sgd.get_config()["name"]
51 except AttributeError :
52 sgd_repr=sgd
53
54 try :
55 mlp_sgd_repr=mlp_sgd.get_config()["name"]
56 except AttributeError :
57 mlp_sgd_repr=mlp_sgd
58
59
60 params={ "h1" : "_".join([ str(x) for x in hidden_size ]),
61 "inside_activation" : input_activation,
62 "output_activation" : output_activation,
63 "do_dropout": "_".join([str(x) for x in do_do]),
64 "loss" : loss,
65 "epochs" : epochs ,
66 "batch_size" : batch,
67 "patience" : patience,
68 "sgd" : sgd_repr,
69 "mlp_h ": "_".join([str(x) for x in mlp_h]),
70 "mlp_loss ": mlp_loss,
71 "mlp_dropouts ": "_".join([str(x) for x in mlp_dropouts]),
72 "mlp_sgd ": mlp_sgd_repr,
73 "mlp_epochs ": mlp_epochs,
74 "mlp_batch_size ": mlp_batch_size,
75 "mlp_output" : mlp_output_activation
76 }
77 name = "_".join([ str(x) for x in params.values()])
78 try:
79 os.mkdir("{}/{}".format(in_dir,name))
80 except:
81 pass
82 db = shelve.open("{}/{}/ae_model.shelve".format(in_dir,name),writeback=True)
83 db["params"] = params
84 db["LABEL"]=infer_model["LABEL"]
85 #
86 json.dump(params,
87 open("{}/{}/ae_model.json".format(in_dir,name),"w"),
88 indent=4)
89
90 keys = ["ASR","TRS"]
91
92 db["AE"] = {}
93 db["LDA"] = {}
94 for mod in keys :
95 print mod
96 db["LDA"][mod] = train_mlp(infer_model["LDA"][mod]["TRAIN"],infer_model["LABEL"][mod]["TRAIN"],
97 infer_model["LDA"][mod]["DEV"],infer_model["LABEL"][mod]["DEV"],
98 infer_model["LDA"][mod]["TEST"],infer_model["LABEL"][mod]["TEST"],
99 mlp_h ,sgd=mlp_sgd,
100 epochs=mlp_epochs,
101 batch_size=mlp_batch_size,
102 input_activation=input_activation,
103 output_activation=mlp_output_activation,
104 dropouts=mlp_dropouts,
105 fit_verbose=0)
106
107 res=train_ae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"],infer_model["LDA"][mod]["TEST"],
108 hidden_size,patience = params["patience"],sgd=sgd,
109 dropouts=do_do,input_activation=input_activation,output_activation=output_activation,
110 loss=loss,epochs=epochs,batch_size=batch,verbose=0)
111 mlp_res_list=[]
112 for layer in res :
113 mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"],
114 layer[1],infer_model["LABEL"][mod]["DEV"],
115 layer[2],infer_model["LABEL"][mod]["TEST"],
116 mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs,
117 output_activation=mlp_output_activation,
118 input_activation=input_activation,
119 batch_size=mlp_batch_size,fit_verbose=0))
120 db["AE"][mod]=mlp_res_list
121
122 mod = "ASR"
123 mod2= "TRS"
124 mlp_res_list=[]
125
126 res = train_ae(infer_model["LDA"][mod]["TRAIN"],
127 infer_model["LDA"][mod]["DEV"],
128 infer_model["LDA"][mod]["TEST"],
129 hidden_size,dropouts=do_do,patience = params["patience"],
130 sgd=sgd,input_activation=input_activation,output_activation=output_activation,loss=loss,epochs=epochs,
131 batch_size=batch,
132 y_train=infer_model["LDA"][mod]["TRAIN"],
133 y_dev=infer_model["LDA"][mod2]["DEV"],
134 y_test=infer_model["LDA"][mod2]["TEST"])
135
136 for layer in res :
137 mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"],
138 layer[1],infer_model["LABEL"][mod]["DEV"],
139 layer[2],infer_model["LABEL"][mod]["TEST"],
140 mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs,
141 output_activation=mlp_output_activation,
142 input_activation=input_activation,
143 batch_size=mlp_batch_size,fit_verbose=0))
144
145 db["AE"]["SPE"] = mlp_res_list
146
147 db.sync()
148 db.close()
149
File was created 1
2 # coding: utf-8
3
4 # In[2]:
5
6 # Import
7 import gensim
8 from scipy import sparse
9 import itertools
10 from sklearn import preprocessing
11 from keras.models import Sequential
12 from keras.optimizers import SGD,Adam
13 from mlp import *
14 import mlp
15 import sklearn.metrics
16 import shelve
17 import pickle
18 from utils import *
19 import sys
20 import os
21 import json
22 # In[4]:
23
24 infer_model=shelve.open("{}".format(sys.argv[2]))
25 in_dir = sys.argv[1]
26 #['ASR', 'TRS', 'LABEL']
27 # In[6]:
28
29
30 hidden_size=[ 100, 80, 50 , 20 ]
31 input_activation="relu"
32 output_activation="relu"
33 loss="mse"
34 epochs=3000
35 batch=1
36 patience=20
37 do_do=[ 0 ] * len(hidden_size)
38 sgd = Adam(lr=0.0001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True)
39 try :
40 sgd_repr=sgd.get_config()["name"]
41 except AttributeError :
42 sgd_repr=sgd
43
44 params={ "h1" : "_".join([str(x) for x in hidden_size]),
45 "inside_activation" : input_activation,
46 "out_activation" : output_activation,
47 "do_dropout": "_".join([str(x) for x in do_do]),
48 "loss" : loss,
49 "epochs" : epochs ,
50 "batch_size" : batch,
51 "patience" : patience,
52 "sgd" : sgd_repr}
53 name = "_".join([ str(x) for x in params.values()])
54 try:
55 os.mkdir("{}/SAE_{}".format(in_dir,name))
56 except:
57 pass
58 db = shelve.open("{}/SAE_{}/ae_model.shelve".format(in_dir,name),writeback=True)
59 #
60 json.dump(params,
61 open("{}/SAE_{}/ae_model.json".format(in_dir,name),"w"),
62 indent=4)
63
64 keys = ["ASR","TRS"]
65
66 mlp_h = [ 150 , 300 ]
67 mlp_loss ="categorical_crossentropy"
68 mlp_dropouts = [0,0,0,0]
69 mlp_sgd = Adam(0.001)
70 mlp_epochs = 2000
71 mlp_batch_size = 8
72
73 db["SAE"] = {}
74
75 db["SAEFT"] = {}
76 for mod in keys :
77 print "MODE ", mod
78 res_tuple=train_sae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"],
79 infer_model["LDA"][mod]["TEST"],
80 hidden_size,dropouts=do_do,
81 patience = params["patience"],sgd=sgd,input_activation="tanh",
82 output_activation="tanh",loss=loss,epochs=epochs,
83 batch_size=batch,verbose=0)
84 #print len(res), [len(x) for x in res[0]], [ len(x) for x in res[1]]
85 for name , levels in zip(["SAE","SAEFT"],res_tuple):
86 print "NAME", name
87 mlp_res_by_level = []
88 for res in levels:
89 mlp_res_list=[]
90 for nb,layer in enumerate(res) :
91 print "layer NB",nb
92 mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"],
93 layer[1],infer_model["LABEL"][mod]["DEV"],
94 layer[2],infer_model["LABEL"][mod]["TEST"],
95 mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,
96 sgd=mlp_sgd,epochs=mlp_epochs,batch_size=mlp_batch_size,
97 fit_verbose=0))
98 mlp_res_by_level.append(mlp_res_list)
99 db[name][mod]=mlp_res_by_level
100
101 mod = "ASR"
102 mod2= "TRS"
103 print "mode SPE "
104 res_tuple = train_sae(infer_model["LDA"][mod]["TRAIN"],
105 infer_model["LDA"][mod]["DEV"],
106 infer_model["LDA"][mod]["TEST"],
107 hidden_size,dropouts=[0],patience=params["patience"],
108 sgd=sgd,input_activation=input_activation,output_activation=input_activation,
109 loss=loss,epochs=epochs,batch_size=batch,
110 y_train=infer_model["LDA"][mod2]["TRAIN"],
111 y_dev=infer_model["LDA"][mod2]["DEV"],
112 y_test=infer_model["LDA"][mod2]["TEST"])
113
114 for name , levels in zip(["SAE","SAEFT"],res_tuple):
115 mlp_res_by_level = []
116 for res in levels :
117 mlp_res_list=[]
118 for layer in res :
119 mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"],
120 layer[1],infer_model["LABEL"][mod]["DEV"],layer[2],
121 infer_model["LABEL"][mod]["TEST"],
122 mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,
123 sgd=mlp_sgd,epochs=mlp_epochs,batch_size=mlp_batch_size,
124 fit_verbose=0))
125 mlp_res_by_level.append(mlp_res_list)
126 db[name]["SPE"] = mlp_res_by_level
127
128 db.close()
129
File was created 1
2 # coding: utf-8
3
4 # In[2]:
5
6 # Import
7 import gensim
8 from scipy import sparse
9 import itertools
10 from sklearn import preprocessing
11 from keras.models import Sequential
12 from keras.optimizers import SGD,Adam
13 from mlp import *
14 import mlp
15 import sklearn.metrics
16 import shelve
17 import pickle
18 from utils import *
19 import sys
20 import os
21 import json
22 # In[4]:
23
24 infer_model=shelve.open("{}".format(sys.argv[2]))
25 in_dir = sys.argv[1]
26 #['ASR', 'TRS', 'LABEL']
27 # In[6]:
28
29 # AE params
30 hidden_size=[ 100, 100 ]
31 input_activation="relu"
32 output_activation="relu"
33 loss="mse"
34 epochs= 1000
35 batch_size=1
36 patience=20
37 do_do=[ 0.25 ] * len(hidden_size)
38 sgd = Adam(lr=0.00001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True)
39 try :
40 sgd_repr=sgd.get_config()["name"]
41 except AttributeError :
42 sgd_repr=sgd
43
44 # Transforme :
45 trans_hidden_size=[ 300 , 300 ]
46 trans_input_activation="relu"
47 trans_output_activation="relu"
48 trans_loss="mse"
49 trans_epochs=1000
50 trans_batch_size=8
51 trans_patience=20
52 trans_do=[ 0.25 ] * len(trans_hidden_size)
53 trans_sgd = Adam(lr=0.0001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True)
54 try :
55 trans_sgd_repr=trans_sgd.get_config()["name"]
56 except AttributeError :
57 trans_sgd_repr=trans_sgd
58
59
60
61 ae={ "h1" : "_".join([str(x) for x in hidden_size]),
62 "inside_activation" : input_activation,
63 "out_activation" : output_activation,
64 "do_dropout": "_".join([str(x) for x in do_do]),
65 "loss" : loss,
66 "epochs" : epochs ,
67 "batch_size" : batch_size,
68 "patience" : patience,
69 "sgd" : sgd_repr}
70 name = "_".join([ str(x) for x in ae.values()])
71
72 trans={ "h1" : "_".join([str(x) for x in trans_hidden_size]),
73 "inside_activation" : trans_input_activation,
74 "out_activation" : trans_output_activation,
75 "do_dropout": "_".join([str(x) for x in trans_do]),
76 "loss" : trans_loss,
77 "epochs" : trans_epochs ,
78 "batch_size" : trans_batch_size,
79 "patience" : trans_patience,
80 "sgd" : trans_sgd_repr}
81
82 mlp_h = [ 300 , 300 ]
83 mlp_loss ="categorical_crossentropy"
84 mlp_dropouts = [0,0,0,0]
85 mlp_sgd = Adam(0.0001)
86 mlp_epochs = 1000
87 mlp_batch_size = 8
88 mlp_input_activation = "relu"
89 mlp_output_activation = "softmax"
90
91 try :
92 mlp_sgd_repr=mlp_sgd.get_config()["name"]
93 except AttributeError :
94 mlp_sgd_repr=mlp_sgd
95
96
97
98 mlp={ "h1" : "_".join([str(x) for x in mlp_h ]),
99 "inside_activation" : mlp_input_activation,
100 "out_activation" : mlp_output_activation,
101 "do_dropout": "_".join([str(x) for x in mlp_dropouts]),
102 "loss" : mlp_loss,
103 "epochs" : mlp_epochs ,
104 "batch_size" : mlp_batch_size,
105 "sgd" : mlp_sgd_repr}
106
107 params = { "ae":ae, "trans":trans, "mlp":mlp}
108 try:
109 os.mkdir("{}/DSAE_{}".format(in_dir,name))
110 except:
111 pass
112 db = shelve.open("{}/DSAE_{}/ae_model.shelve".format(in_dir,name),writeback=True)
113 #
114 json.dump(params,
115 open("{}/DSAE_{}/ae_model.json".format(in_dir,name),"w"),
116 indent=4)
117
118 keys = ["ASR","TRS"]
119
120
121
122 db["DSAE"] = {}
123
124 db["DSAEFT"] = {}
125 mod = "ASR"
126 res_tuple_ASR = train_ae(infer_model["LDA"][mod]["TRAIN"],
127 infer_model["LDA"][mod]["DEV"],
128 infer_model["LDA"][mod]["TEST"],
129 hidden_size,dropouts=do_do,
130 patience = patience,sgd=sgd,
131 input_activation=input_activation,
132 output_activation=output_activation,loss=loss,epochs=epochs,
133 batch_size=batch_size,verbose=0,get_weights=True)
134 mlp_res_list = []
135 for layer in res_tuple_ASR[0]:
136 mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"],
137 layer[1],infer_model["LABEL"][mod]["DEV"],
138 layer[2],infer_model["LABEL"][mod]["TEST"],
139 mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,
140 sgd=mlp_sgd,epochs=mlp_epochs,
141 output_activation=mlp_output_activation,
142 input_activation=mlp_input_activation,
143 batch_size=mlp_batch_size,fit_verbose=0))
144
145 db["DSAE"][mod] = mlp_res_list
146 mod = "TRS"
147 print hidden_size
148 res_tuple_TRS = train_ae(infer_model["LDA"][mod]["TRAIN"],
149 infer_model["LDA"][mod]["DEV"],
150 infer_model["LDA"][mod]["TEST"],
151 hidden_size,dropouts=do_do,
152 sgd=sgd,input_activation=input_activation,
153 output_activation=output_activation,loss=loss,epochs=epochs,
154 batch_size=batch_size,patience=patience,
155 verbose=0,get_weights=True)
156
157 mlp_res_list = []
158 for layer in res_tuple_TRS[0]:
159 mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"],
160 layer[1],infer_model["LABEL"][mod]["DEV"],
161 layer[2],infer_model["LABEL"][mod]["TEST"],
162 mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,
163 sgd=mlp_sgd,epochs=mlp_epochs,
164 output_activation=mlp_output_activation,
165 input_activation=mlp_input_activation,
166 batch_size=mlp_batch_size,fit_verbose=0))
167
168 db["DSAE"][mod] = mlp_res_list
169
170
171
172 transfert = []
173
174 print " get weight trans"
175
176 for asr_pred, trs_pred in zip(res_tuple_ASR[0], res_tuple_TRS[0]):
177 print "ASR", [ x.shape for x in asr_pred]
178
179 print "TRS", [ x.shape for x in trs_pred]
180 print
181
182 for asr_pred, trs_pred in zip(res_tuple_ASR[0], res_tuple_TRS[0]):
183 print "ASR", [ x.shape for x in asr_pred]
184
185 print "TRS", [ x.shape for x in trs_pred]
186 transfert.append( train_ae(asr_pred[0],
187 asr_pred[1],
188 asr_pred[2],
189 trans_hidden_size,
190 dropouts=trans_do,
191 y_train = trs_pred[0],
192 y_dev=trs_pred[1],
193 y_test = trs_pred[2],
194 patience = trans_patience,sgd=trans_sgd,
195 input_activation=trans_input_activation,
196 output_activation=trans_output_activation,
197 loss=trans_loss,
198 epochs=trans_epochs,
199 batch_size=trans_batch_size,verbose=0,get_weights=True) )
200 mod = "ASR"
201 mlp_res_bylvl = []
202 print " MLP on transfert "
203 for level, w in transfert :
204 mlp_res_list = []
205 for layer in level :
206 mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"],
207 layer[1],infer_model["LABEL"][mod]["DEV"],
208 layer[2],infer_model["LABEL"][mod]["TEST"],
209 mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,
210 sgd=mlp_sgd,epochs=mlp_epochs,
211 output_activation=mlp_output_activation,
212 input_activation=mlp_input_activation,
213 batch_size=mlp_batch_size,fit_verbose=0))
214 mlp_res_bylvl.append(mlp_res_list)
215 db["DSAE"]["transfert"] = mlp_res_bylvl
216
217
218 print " FT "
219 WA = res_tuple_ASR[1]
220 print "WA", len(WA), [ len(x) for x in WA]
221 WT = res_tuple_TRS[1]
222
223 print "WT", len(WT), [ len(x) for x in WT]
224 Wtr = [ x[1] for x in transfert]
225
226 print "Wtr", len(Wtr), [ len(x) for x in Wtr],[ len(x[1]) for x in Wtr]
227
228 ft_res = ft_dsae(infer_model["LDA"]["ASR"]["TRAIN"],
229 infer_model["LDA"]["ASR"]["DEV"],
230 infer_model["LDA"]["ASR"]["TEST"],
231 y_train=infer_model["LDA"]["TRS"]["TRAIN"],
232 y_dev=infer_model["LDA"]["TRS"]["DEV"],
233 y_test=infer_model["LDA"]["TRS"]["TEST"],
234 ae_hidden = hidden_size,
235 transfer_hidden = trans_hidden_size,
236 start_weights = WA,
237 transfer_weights = Wtr,
238 end_weights = WT,
239 input_activation = input_activation,
240 output_activation = output_activation,
241 ae_dropouts= do_do,
242 transfer_do = trans_do,
243 sgd = sgd,
244 loss = loss ,
245 patience = patience,
246 batch_size = batch_size,
247 epochs= epochs)
248 mlps_by_lvls= []
249 for level in ft_res :
250 mlp_res_list = []
251 for layer in level :
252 mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"],
253 layer[1],infer_model["LABEL"][mod]["DEV"],
254 layer[2],infer_model["LABEL"][mod]["TEST"],
255 mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,
256 sgd=mlp_sgd,epochs=mlp_epochs,
257 output_activation=mlp_output_activation,
258 input_activation=mlp_input_activation,
259 batch_size=mlp_batch_size,fit_verbose=0))
260 mlps_by_lvls.append(mlp_res_list)
261
262
263 db["DSAEFT"]["transfert"] = mlps_by_lvls
264
265 db.close()
266
File was created 1
2 # coding: utf-8
3
4 # In[2]:
5
6 # Import
7 import gensim
8 from scipy import sparse
9 import itertools
10 from sklearn import preprocessing
11 from keras.models import Sequential
12 from keras.optimizers import SGD,Adam
13 from mlp import *
14 from vae import *
15 import sklearn.metrics
16 import shelve
17 import pickle
18 from utils import *
19 import sys
20 import os
21 import json
22 # In[4]:
23
24 infer_model=shelve.open("{}".format(sys.argv[2]))
25 in_dir = sys.argv[1]
26 #['ASR', 'TRS', 'LABEL']
27 # In[6]:
28
29
30 hidden_size= [60]
31 input_activation="tanh"
32 output_activation="sigmoid"
33 epochs=300
34 batch=1
35 patience=60
36 sgd = Adam(lr=0.0001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True)
37 latent_dim = 30
38
39
40
41 mlp_h = [ 256 ]
42 mlp_loss = "categorical_crossentropy"
43 mlp_dropouts = []
44 mlp_sgd = Adam(lr=0.001)
45 mlp_epochs = 1000
46 mlp_batch_size = 16
47 mlp_output_activation="softmax"
48
49 try :
50 sgd_repr=sgd.get_config()["name"]
51 except AttributeError :
52 sgd_repr=sgd
53
54 try :
55 mlp_sgd_repr=mlp_sgd.get_config()["name"]
56 except AttributeError :
57 mlp_sgd_repr=mlp_sgd
58
59
60 params={ "h1" : "_".join([ str(x) for x in hidden_size ]),
61 "inside_activation" : input_activation,
62 "output_activation" : output_activation,
63 "epochs" : epochs ,
64 "batch_size" : batch,
65 "patience" : patience,
66 "sgd" : sgd_repr,
67 "mlp_h ": "_".join([str(x) for x in mlp_h]),
68 "mlp_loss ": mlp_loss,
69 "mlp_dropouts ": "_".join([str(x) for x in mlp_dropouts]),
70 "mlp_sgd ": mlp_sgd_repr,
71 "mlp_epochs ": mlp_epochs,
72 "mlp_batch_size ": mlp_batch_size,
73 "mlp_output" : mlp_output_activation
74 }
75 name = "_".join([ str(x) for x in params.values()])
76 try:
77 os.mkdir("{}/VAE_{}".format(in_dir,name))
78 except:
79 pass
80 db = shelve.open("{}/VAE_{}/ae_model.shelve".format(in_dir,name),writeback=True)
81 db["params"] = params
82 db["LABEL"]=infer_model["LABEL"]
83 #
84 json.dump(params,
85 open("{}/VAE_{}/ae_model.json".format(in_dir,name),"w"),
86 indent=4)
87
88 keys = ["ASR","TRS"]
89
90 db["VAE"] = {}
91 db["LDA"] = {}
92 for mod in keys :
93 print mod
94 db["LDA"][mod] = train_mlp(infer_model["LDA"][mod]["TRAIN"],infer_model["LABEL"][mod]["TRAIN"],
95 infer_model["LDA"][mod]["DEV"],infer_model["LABEL"][mod]["DEV"],
96 infer_model["LDA"][mod]["TEST"],infer_model["LABEL"][mod]["TEST"],
97 mlp_h ,sgd=mlp_sgd,
98 epochs=mlp_epochs,
99 batch_size=mlp_batch_size,
100 input_activation=input_activation,
101 output_activation=mlp_output_activation,
102 dropouts=mlp_dropouts,
103 fit_verbose=0)
104
105 res=train_vae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"],infer_model["LDA"][mod]["TEST"],
106 hidden_size=hidden_size[0],
107 latent_dim=latent_dim,sgd=sgd,
108 input_activation=input_activation,output_activation=output_activation,
109 nb_epochs=epochs,batch_size=batch)
110 mlp_res_list=[]
111 for layer in res :
112 mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"],
113 layer[1],infer_model["LABEL"][mod]["DEV"],
114 layer[2],infer_model["LABEL"][mod]["TEST"],
115 mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs,
116 output_activation=mlp_output_activation,
117 input_activation=input_activation,
118 batch_size=mlp_batch_size,fit_verbose=0))
119 db["VAE"][mod]=mlp_res_list
120
121 mod = "ASR"
122 mod2= "TRS"
123 mlp_res_list=[]
124
125 res = train_vae(infer_model["LDA"][mod]["TRAIN"],
126 infer_model["LDA"][mod]["DEV"],
127 infer_model["LDA"][mod]["TEST"],
128 hidden_size=hidden_size[0],
129 sgd=sgd,input_activation=input_activation,output_activation=output_activation,
130 latent_dim=latent_dim,
131 nb_epochs=epochs,
132 batch_size=batch,
133 y_train=infer_model["LDA"][mod2]["TRAIN"],
134 y_dev=infer_model["LDA"][mod2]["DEV"],
135 y_test=infer_model["LDA"][mod2]["TEST"])
136
137 for layer in res :
138 mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"],
139 layer[1],infer_model["LABEL"][mod]["DEV"],
140 layer[2],infer_model["LABEL"][mod]["TEST"],
141 mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs,
142 output_activation=mlp_output_activation,
143 input_activation=input_activation,
144 batch_size=mlp_batch_size,fit_verbose=0))
145
146 db["VAE"]["SPE"] = mlp_res_list
147
148 db.sync()
149 db.close()
150
LDA/05-mmf_getscore.py
File was created 1 import numpy as np
2 import shelve
3 import sys
4 import glob
5 from collections import defaultdict
6 from tinydb import TinyDB, Query
7 from mako.template import Template
8 import time
9
10 def get_best(x):
11 argbest=np.argmax(x[1])
12 maxdev=x[1][argbest]
13 maxtrain=np.max(x[0])
14 maxtest=np.max(x[2])
15 besttest=x[2][argbest]
16 return ( maxtrain,maxdev,maxtest,besttest)
17 depth = lambda L: isinstance(L, list) and max(map(depth, L))+1
18
19
20 template_name = '''
21 ${name}
22 ========================
23
24 MLP scores :
25 -------------------
26 '''
27 template_value='''\n\n
28 | ${model} ${ttype} | train | dev |max test| best test|
29 | -------------------:|:--------:|:---------:|:------:|:--------:|
30 % for cpt,line in enumerate(models[model][ttype]):
31 | ${cpt} | ${line[0]} | ${line[1]} |${line[2]} | ${line[3]} |
32 % endfor
33 \n
34 '''
35
36 # ae_model.shelve
37 def get_folder_file(x):
38 folder=x.split("/")[1]
39 shelve_file = ".".join(x.split(".")[:-1])
40 return(folder,shelve_file)
41
42 in_folder = sys.argv[1]
43
44
45 models = defaultdict(dict)
46
47 ae_model_list = glob.glob("{}/*/ae_model.shelve.dir".format(in_folder))
48 ae_model_list = sorted(ae_model_list)
49 ae_model_list= map(get_folder_file,ae_model_list)
50 for name , shelve_file in ae_model_list :
51 print Template(template_name).render(name=name)
52 opened_shelve = shelve.open(shelve_file)
53 keys = opened_shelve.keys()
54 if "LABEL" in keys :
55 keys.remove("LABEL")
56 if "params" in keys:
57 keys.remove("params")
58 to_print = []
59 for working_key in keys:
60 for key in opened_shelve[working_key].keys():
61 table_depth = depth(opened_shelve[working_key][key])
62 if table_depth == 3 :
63 models[working_key][key] = [ get_best(x) for x in opened_shelve[working_key][key] ]
64 to_print.append(Template(template_value).render(model=working_key,ttype=key,models=models).strip())
65 elif table_depth == 2 :
66 models[working_key][key] = [ get_best(opened_shelve[working_key][key]) ]
67 to_print.append(Template(template_value).render(model=working_key,ttype=key,models=models).strip())
68 elif table_depth == 4 :
69 for layer in opened_shelve[working_key][key] :
70 models[working_key][key] = [ get_best(x) for x in layer ]
71 to_print.append(Template(template_value).render(model=working_key,ttype=key,models=models).strip())
72 print "\n".join(to_print)
73
74
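The report generator above branches on how deeply each stored score table is nested (depth 2, 3 or 4). A small illustration of the depth() lambda, recreated verbatim from the script:

depth = lambda L: isinstance(L, list) and max(map(depth, L)) + 1

print depth([0.1, 0.2])          # 1 : flat list of scores
print depth([[0.1, 0.2]])        # 2 : one table of score lists
print depth([[[0.1], [0.2]]])    # 3 : list of such tables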
1 #python 00-prepross.py 1 #python 00-prepross.py
2 python 02-lda-order.py DECODA_list_wid.shelve output_v5/perplex.db 50 10 output_v5 50_500 0 1_0.1 1_0.1 500_1000 100_2000 2 python 02-lda.py DECODA_list_wid.shelve output_v5/perplex.db 50 10 output_v5 50_500 0 1_0.1 1_0.1 500_1000 100_2000
3 #python 03-perplex.py DECODA_list_wid.shelve output_v5 output_v5/perplex.db 3 #python 03-perplex.py DECODA_list_wid.shelve output_v5 output_v5/perplex.db
4 python 03-order_by_perp.py output_v5/perplex.db output_v5 4 python 03-order_by_perp.py output_v5/perplex.db output_v5
5 bash 04-run_mlp_ae.sh output_v5 DECODA_list_wid.shelve 5 bash 04-run_mlp_ae.sh output_v5 DECODA_list_wid.shelve
6 python 05-getscore.py output_v5 > res.mkd 6 python 05-getscore.py output_v5 > res.mkd
7 notedown res.mkd >res_v5.ipynb 7 notedown res.mkd >res_v5.ipynb
8 8
1 # -*- coding: utf-8 -*- 1 # -*- coding: utf-8 -*-
2 import nltk 2 import nltk
3 import re 3 import re
4 pattern = ur"\d+(?:\.\d+)?\s*%?|\w{1,2}'|<unk>|[\wéàèùêôûâòìîç]+|[^\w\s]" 4 pattern = ur"\d+(?:\.\d+)?\s*%?|\w{1,2}'|<unk>|[\wéàèùêôûâòìîç]+|[^\w\s]"
5 rer_b = re.compile(ur" r e r(?: e r)? b ") 5 rer_b = re.compile(ur" r e r(?: e r)? b ")
6 rer_c = re.compile(ur" r e r(?: e r)? c |r e r( e r)? c' est | rer c' est") 6 rer_c = re.compile(ur" r e r(?: e r)? c |r e r( e r)? c' est | rer c' est")
7 rer = re.compile(ur" (e )?r e r(?: e r)? |re r( e r)? |rer e r | r e rer | r e r | r e rer |r( e r)+ ") 7 rer = re.compile(ur" (e )?r e r(?: e r)? |re r( e r)? |rer e r | r e rer | r e r | r e rer |r( e r)+ ")
8 sncf = re.compile(ur" s n c f ") 8 sncf = re.compile(ur" s n c f ")
9 jusq = re.compile(ur" jusqu ' ") 9 jusq = re.compile(ur" jusqu ' ")
10 ratp = re.compile(ur" r a t(?: p)? ") 10 ratp = re.compile(ur" r a t(?: p)? ")
11 quel = re.compile(ur" quelqu ' ") 11 quel = re.compile(ur" quelqu ' ")
12 space = re.compile(ur" +") 12 space = re.compile(ur" +")
13 tok2 = nltk.RegexpTokenizer(pattern,flags=re.UNICODE ) 13 tok2 = nltk.RegexpTokenizer(pattern,flags=re.UNICODE )
14 # (?x)\d+(\.\d+)?\s*%| \w'| \w+| [^\w\s] 14 # (?x)\d+(\.\d+)?\s*%| \w'| \w+| [^\w\s]
15 15
16 def preproc(line): 16 def preproc(line):
17 # print 1,line.encode('utf8') 17 # print 1,line.encode('utf8')
18 line = space.subn(u" ",line)[0] 18 line = space.subn(u" ",line)[0]
19 line = rer_b.subn(u" rer b ",line)[0] 19 line = rer_b.subn(u" rer b ",line)[0]
20 line = rer_c.subn(u" rer c ",line)[0] 20 line = rer_c.subn(u" rer c ",line)[0]
21 line = rer.subn(u" rer ",line)[0] 21 line = rer.subn(u" rer ",line)[0]
22 line = sncf.subn(u" sncf ",line)[0] 22 line = sncf.subn(u" sncf ",line)[0]
23 line = ratp.subn(u" ratp ",line)[0] 23 line = ratp.subn(u" ratp ",line)[0]
24 line = jusq.subn(u" jusqu' ",line)[0] 24 line = jusq.subn(u" jusqu' ",line)[0]
25 line = quel.subn(u" quelqu' ",line)[0] 25 line = quel.subn(u" quelqu' ",line)[0]
26 line = space.subn(u" ",line)[0] 26 line = space.subn(u" ",line)[0]
27 # print 2,line.encode('utf8') 27 # print 2,line.encode('utf8')
28 return line.lower() 28 return line.lower()
29 29
30 def yield_corpus(df_list): 30 def yield_corpus(df_list):
31 for corpus in df_list: 31 for corpus in df_list:
32 for id,doc in corpus.iterrows(): 32 for id,doc in corpus.iterrows():
33 try: 33 try:
34 a = tok2.tokenize(preproc(doc[2].decode("utf-8"))) 34 a = tok2.tokenize(preproc(doc[2].decode("utf-8")))
35 # print 3, " ".join(a).encode("utf8") 35 # print 3, " ".join(a).encode("utf8")
36 yield a 36 yield a
37 except: 37 except:
38 print doc[2] 38 print doc[2]
39 raise 39 raise
40 def select(elm): 40 def select(elm):
41 return int(elm.split("_")[-1]) 41 return int(elm.split("_")[-1])
42
43
44 def select_mmf(elm):
45 return int(elm.split("_")[0])
42 46
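The new select_mmf helper reads the label from the first field of an identifier instead of the last field used by select. A tiny illustration; the identifier below is made up and only shows the underscore-separated shape both helpers assume:

def select(elm):
    return int(elm.split("_")[-1])

def select_mmf(elm):
    return int(elm.split("_")[0])

doc_id = "8_dialogue_0042"   # hypothetical identifier
print select(doc_id)         # 42 -> label taken from the last field
print select_mmf(doc_id)     # 8  -> label taken from the first field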
File was created 1 '''This script demonstrates how to build a variational autoencoder with Keras.
2 Reference: "Auto-Encoding Variational Bayes" https://arxiv.org/abs/1312.6114
3 '''
4
5 import itertools
6 import sys
7 import json
8
9 import numpy as np
10 import matplotlib.pyplot as plt
11 from scipy import sparse
12 import scipy.io
13
14 from keras.layers import Input, Dense, Lambda
15 from keras.models import Model
16 from keras import backend as K
17 from keras import objectives
18 from keras.datasets import mnist
19
20 import pandas
21 import shelve
22 import pickle
23
24
25
26
27
28 #batch_size = 16
29 #original_dim = 784
30 #latent_dim = 2
31 #intermediate_dim = 128
32 #epsilon_std = 0.01
33 #nb_epoch = 40
34
35
36
37
38 def train_vae(x_train,x_dev,x_test,y_train=None,y_dev=None,y_test=None,hidden_size=80,latent_dim=12,batch_size=8,nb_epochs=10,sgd="rmsprop",input_activation = "relu",output_activation = "sigmoid",epsilon_std=0.01):
39
40
41
42 def sampling(args):
43 z_mean, z_log_std = args
44 epsilon = K.random_normal(shape=(batch_size, latent_dim),
45 mean=0., std=epsilon_std)
46 return z_mean + K.exp(z_log_std) * epsilon
47
48 def vae_loss(x, x_decoded_mean):
49 xent_loss = objectives.binary_crossentropy(x, x_decoded_mean)
50 kl_loss = - 0.5 * K.mean(1 + z_log_std - K.square(z_mean) - K.exp(z_log_std), axis=-1)
51 return xent_loss + kl_loss
52
53 original_dim = x_train.shape[1]
54
55
56 x = Input(batch_shape=(batch_size, original_dim))
57 h = Dense(hidden_size, activation=input_activation)(x)
58 z_mean = Dense(latent_dim)(h)
59 z_log_std = Dense(latent_dim)(h)
60
61
62 # note that "output_shape" isn't necessary with the TensorFlow backend
63 # so you could write `Lambda(sampling)([z_mean, z_log_std])`
64 z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_std])
65
66 # we instantiate these layers separately so as to reuse them later
67 decoder_h = Dense(hidden_size, activation=input_activation)
68 decoder_mean = Dense(original_dim, activation=output_activation)
69 h_decoded = decoder_h(z)
70 x_decoded_mean = decoder_mean(h_decoded)
71
72
73 vae = Model(x, x_decoded_mean)
74 vae.compile(optimizer=sgd, loss=vae_loss)
75
76 # train the VAE on MNIST digits
77 if y_train is None or y_dev is None or y_test is None :
78 y_train = x_train
79 y_dev = x_dev
80 y_test = x_test
81
82 vae.fit(x_train, y_train,
83 shuffle=True,
84 nb_epoch=nb_epochs,
85 batch_size=batch_size,
86 validation_data=(x_dev, y_dev))
87
88 # build a model to project inputs on the latent space
89 encoder = Model(x, z_mean)
90 pred_train = encoder.predict(x_train, batch_size=batch_size)
91 pred_dev = encoder.predict(x_dev, batch_size=batch_size)
92 pred_test = encoder.predict(x_test,batch_size=batch_size)
93 return [ [ pred_train, pred_dev, pred_test ] ]
94 # display a 2D plot of the digit classes in the latent space
95 #x_test_encoded = encoder.predict(x_test, batch_size=batch_size)
96 # build a digit generator that can sample from the learned distribution
97 #decoder_input = Input(shape=(latent_dim,))
98 #_h_decoded = decoder_h(decoder_input)
99 #_x_decoded_mean = decoder_mean(_h_decoded)
100 #generator = Model(decoder_input, _x_decoded_mean)
101 #x_decoded = generator.predict(z_sample)
102
103
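A minimal usage sketch of train_vae on random data; the shapes and hyperparameters are illustrative, it assumes the same Keras 1.x API the module itself imports, and the number of rows in each split should be a multiple of batch_size because the encoder is built with a fixed batch_shape:

import numpy as np
from vae import train_vae

# toy feature matrices: 32 train / 16 dev / 16 test documents, 40 dimensions
x_train = np.random.rand(32, 40)
x_dev = np.random.rand(16, 40)
x_test = np.random.rand(16, 40)

reps = train_vae(x_train, x_dev, x_test,
                 hidden_size=20, latent_dim=5,
                 batch_size=8, nb_epochs=2,
                 sgd="rmsprop", input_activation="tanh", output_activation="sigmoid")
print [r.shape for r in reps[0]]   # latent representations for train / dev / test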