Commit 7db73861ffbab3f3f51b17188d8894a512b36264
1 parent b6d0165d16
Exists in master

add vae and mmf

Showing 13 changed files with 1084 additions and 44 deletions
LDA/00-mmf_make_features.py
1 | +import sys | |
2 | +import os | |
3 | + | |
4 | +import pandas | |
5 | +import numpy | |
6 | +import shelve | |
7 | + | |
8 | +from sklearn.preprocessing import LabelBinarizer | |
9 | + | |
10 | +from utils import select_mmf as select | |
11 | + | |
12 | +input_dir = sys.argv[1] # top-level directory containing ASR and TRS | |
13 | +level = sys.argv[2] # desired LDA size ( -5) | |
14 | + | |
15 | +lb=LabelBinarizer() | |
16 | +#y_train=lb.fit_transform([utils.select(ligneid) for ligneid in origin_corps["LABEL"]["TRAIN"]]) | |
17 | + | |
18 | + | |
19 | +data = shelve.open("{}/mmf_{}.shelve".format(input_dir,level), writeback=True) # writeback so nested dict updates persist | |
20 | +data["LABEL"], data["LDA"] = {}, {} # separate top-level dicts for labels and LDA features | |
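 | +# resulting shelve layout: data["LDA"][mod][split] -> LDA feature matrix and | |
 | +# data["LABEL"][mod][split] -> one-hot labels, for mod in {ASR, TRS} and | |
 | +# split in {TRAIN, DEV, TEST} | |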
21 | +for mod in ["ASR", "TRS"]: | |
22 | + train = pandas.read_table("{}/{}/train_{}.ssv".format(input_dir, mod, level), sep=" ", header=None ) | |
23 | + dev = pandas.read_table("{}/{}/dev_{}.ssv".format(input_dir, mod, level), sep=" ", header=None ) | |
24 | + test = pandas.read_table("{}/{}/test_{}.ssv".format(input_dir, mod, level), sep=" ", header=None ) | |
25 | + | |
26 | + y_train = train.iloc[:,0].apply(select) | |
27 | + y_dev = dev.iloc[:,0].apply(select) | |
28 | + y_test = test.iloc[:,0].apply(select) | |
29 | + lb.fit(y_train) | |
30 | + data["LABEL"][mod]={"TRAIN":lb.transform(y_train),"DEV":lb.transform(y_dev), "TEST": lb.transform(y_test)} | |
31 | + | |
32 | + data["LDA"][mod]={} | |
33 | + data["LDA"][mod]["TRAIN"]=train.iloc[:,1:].values | |
34 | + data["LDA"][mod]["DEV"]=dev.iloc[:,1:].values | |
35 | + data["LDA"][mod]["TEST"]=test.iloc[:,1:].values | |
36 | + | |
37 | +data.sync() | |
38 | +data.close() |
LDA/02-lda.py
... | ... | @@ -12,10 +12,11 @@ |
12 | 12 | import dill |
13 | 13 | from tinydb import TinyDB, where, Query |
14 | 14 | import time |
15 | +from joblib import Parallel, delayed | |
15 | 16 | |
16 | 17 | def calc_perp(models,train): |
17 | 18 | |
18 | - | |
19 | + | |
19 | 20 | stop_words=models[1] |
20 | 21 | name = models[0] |
21 | 22 | |
... | ... | @@ -45,7 +46,8 @@ |
45 | 46 | def train_lda(out_dir,train,size,it,sw_size,alpha,eta,passes,chunk): |
46 | 47 | name = "s{}_it{}_sw{}_a{}_e{}_p{}_c{}".format(size,it,sw_size,alpha,eta,passes,chunk) |
47 | 48 | logging.warning(name) |
48 | - if os.path.isfile(out_dir+"/"+name+".dill"): | |
49 | + deep_out_dir = out_dir+"/"+name | |
50 | + if os.path.isdir(deep_out_dir): | |
49 | 51 | logging.error(name+" already done") |
50 | 52 | return |
51 | 53 | logging.warning(name+" to be done") |
... | ... | @@ -54,7 +56,6 @@ |
54 | 56 | asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ] |
55 | 57 | trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ] |
56 | 58 | stop_words=set(asr_sw) | set(trs_sw) |
57 | - stop_words=[ x.strip() for x in open("french.txt").readlines() ] | |
58 | 59 | |
59 | 60 | logging.warning("TRS to be done") |
60 | 61 | |
61 | 62 | |
62 | 63 | |
63 | 64 | |
... | ... | @@ -68,19 +69,42 @@ |
68 | 69 | asr_probs = [] |
69 | 70 | for line in lda_asr.expElogbeta: |
70 | 71 | nline = line / np.sum(line) |
71 | - asr_probs.append( str(x) for x in nline) | |
72 | + asr_probs.append([ str(x) for x in nline]) | |
72 | 73 | trs_probs = [] |
73 | 74 | for line in lda_trs.expElogbeta: |
74 | 75 | nline = line / np.sum(line) |
75 | - trs_probs.append( str(x) for x in nline) | |
76 | + trs_probs.append([str(x) for x in nline]) | |
76 | 77 | |
77 | 78 | K = lda_asr.num_topics |
78 | 79 | topicWordProbMat_asr = lda_asr.print_topics(K,10) |
79 | 80 | |
80 | 81 | K = lda_trs.num_topics |
81 | 82 | topicWordProbMat_trs = lda_trs.print_topics(K,10) |
83 | + os.mkdir(deep_out_dir) | |
84 | + dill.dump(list(stop_words),open(deep_out_dir+"/stopwords.dill","wb")) | |
85 | + lda_asr.save(deep_out_dir+"/lda_asr.model") | |
86 | + lda_trs.save(deep_out_dir+"/lda_trs.model") | |
87 | + dill.dump(asr_probs,open(deep_out_dir+"/lda_asr_probs.dill","wb")) | |
88 | + dill.dump(trs_probs,open(deep_out_dir+"/lda_trs_probs.dill","wb")) | |
89 | + | |
82 | 90 | return [name, stop_words, lda_asr , asr_probs , topicWordProbMat_asr, lda_trs, trs_probs, topicWordProbMat_trs] |
83 | 91 | |
92 | +def train_one(name,train,s,i,sw,a,e,p,c): | |
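 | + # train and score one (size, iterations, stop-words, alpha, eta, passes, chunk) | |
 | + # configuration; wrapped as a function so joblib can run configs in parallel | |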
93 | + st=time.time() | |
94 | + logging.warning(" ; ".join([str(x) for x in [s,i,sw,a,e,p,c]])) | |
95 | + models = train_lda(name,train,s,i,sw,a,e,p,c) | |
96 | + if models: | |
97 | + m = calc_perp(models,train) | |
98 | + #dill.dump(models,open("{}/{}.dill".format(name,models[0]),"wb")) | |
99 | + else : | |
100 | + m = None | |
101 | + end = time.time() # do not shadow the eta parameter "e" | |
102 | + logging.warning("done in: {}".format(end-st)) | |
103 | + return m | |
104 | + | |
105 | + | |
106 | + | |
107 | + | |
84 | 108 | if __name__ == "__main__": |
85 | 109 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING) |
86 | 110 | |
... | ... | @@ -109,6 +133,8 @@ |
109 | 133 | db = TinyDB(db_path) |
110 | 134 | nb_model = len(passes) * len(chunk) * len(it) * len(sw_size) * len(alpha) * len(eta) * len(size) |
111 | 135 | logging.warning(" hey will train {} models ".format(nb_model)) |
136 | + | |
137 | + args_list=[] | |
112 | 138 | for p in passes: |
113 | 139 | for c in chunk: |
114 | 140 | for i in it : |
... | ... | @@ -116,13 +142,9 @@ |
116 | 142 | for a in alpha: |
117 | 143 | for e in eta: |
118 | 144 | for s in size: |
119 | - st=time.time() | |
120 | - logging.warning(" ; ".join([str(x) for x in [s,i,sw,a,e,p,c]])) | |
121 | - models = train_lda(name,train,s,i,sw,a,e,p,c) | |
122 | - if models: | |
123 | - m = calc_perp(models,train) | |
124 | - dill.dump(models,open("{}/{}.dill".format(name,models[0]),"wb")) | |
125 | - db.insert(m) | |
126 | - e = time.time() | |
127 | - logging.warning("fin en : {}".format(e-st)) | |
145 | + args_list.append((name,train,s,i,sw,a,e,p,c)) | |
146 | + res_list= Parallel(n_jobs=15)(delayed(train_one)(*args) for args in args_list) | |
147 | + for m in res_list : | |
148 | + if m is not None : db.insert(m) # skip configs that were already done | |
149 | + |
LDA/03-mono_perplex.py
... | ... | @@ -52,7 +52,7 @@ |
52 | 52 | input_dir = sys.argv[2] |
53 | 53 | db_path = sys.argv[3] |
54 | 54 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING) |
55 | - folders = glob.glob("{}/*".format(input_dir)) | |
55 | + folders = glob.glob("{}/s*".format(input_dir)) | |
56 | 56 | |
57 | 57 | #train=pickle.load(open("{}/newsgroup_bow_train.pk".format(input_dir))) |
58 | 58 | train = shelve.open(input_shelve) |
LDA/03-perplex.py
... | ... | @@ -22,40 +22,43 @@ |
22 | 22 | |
23 | 23 | |
24 | 24 | def calc_perp(params): |
25 | - in_dir,train = params | |
26 | - name = in_dir.split("/")[-1] | |
27 | - # s40_it1_sw50_a0.01_e0.1_p6_c1000 | |
25 | + try: | |
26 | + in_dir,train = params | |
27 | + name = in_dir.split("/")[-1] | |
28 | + # s40_it1_sw50_a0.01_e0.1_p6_c1000 | |
28 | 29 | |
29 | - entry = Query() | |
30 | - value=db.search(entry.name == name) | |
31 | - if len(value) > 0 : | |
32 | - logging.warning("{} already done".format(name)) | |
33 | - return | |
30 | + entry = Query() | |
31 | + value=db.search(entry.name == name) | |
32 | + if len(value) > 0 : | |
33 | + logging.warning("{} already done".format(name)) | |
34 | + return | |
34 | 35 | |
35 | - sw_size = int(name.split("_")[2][2:]) | |
36 | + sw_size = int(name.split("_")[2][2:]) | |
36 | 37 | |
37 | - logging.warning(" go {} ".format(name)) | |
38 | + logging.warning(" go {} ".format(name)) | |
38 | 39 | |
39 | 40 | |
40 | - logging.warning("Redo Vocab and stop") | |
41 | - asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y]) | |
42 | - trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y]) | |
43 | - asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ] | |
44 | - trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ] | |
45 | - stop_words=set(asr_sw) | set(trs_sw) | |
41 | + logging.warning("Redo Vocab and stop") | |
42 | + asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y]) | |
43 | + trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y]) | |
44 | + asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ] | |
45 | + trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ] | |
46 | + stop_words=set(asr_sw) | set(trs_sw) | |
46 | 47 | |
47 | - logging.warning("TRS to be done") | |
48 | - | |
49 | - dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]] | |
50 | - lda_trs = LdaModel.load("{}/lda_trs.model".format(in_dir)) | |
51 | - perp_trs = lda_trs.log_perplexity(dev_trs) | |
52 | - logging.warning("ASR to be done") | |
53 | - dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]] | |
54 | - lda_asr = LdaModel.load("{}/lda_asr.model".format(in_dir)) | |
55 | - perp_asr = lda_asr.log_perplexity(dev_asr) | |
56 | - logging.warning("ASR saving") | |
57 | - res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs} | |
58 | - return res_dict | |
48 | + logging.warning("TRS to be done") | |
49 | + | |
50 | + dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]] | |
51 | + lda_trs = LdaModel.load("{}/lda_trs.model".format(in_dir)) | |
52 | + perp_trs = lda_trs.log_perplexity(dev_trs) | |
53 | + logging.warning("ASR to be done") | |
54 | + dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]] | |
55 | + lda_asr = LdaModel.load("{}/lda_asr.model".format(in_dir)) | |
56 | + perp_asr = lda_asr.log_perplexity(dev_asr) | |
57 | + logging.warning("ASR saving") | |
58 | + res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs} | |
59 | + return res_dict | |
60 | + except Exception : | |
61 | + return { "name" : name } # failed: record the name with no perplexities | |
59 | 62 | |
60 | 63 | if __name__ == "__main__": |
61 | 64 | input_shelve = sys.argv[1] |
LDA/04a-mmdf.py
1 | + | |
2 | +# coding: utf-8 | |
3 | + | |
4 | +# In[29]: | |
5 | + | |
6 | +# Import | |
7 | +import itertools | |
8 | +import shelve | |
9 | +import pickle | |
10 | +import numpy | |
11 | +import scipy | |
12 | +from scipy import sparse | |
13 | +import scipy.sparse | |
14 | +import scipy.io | |
15 | +from keras.optimizers import Adam | |
16 | +import mlp | |
17 | +import sys | |
18 | +import utils | |
19 | +import dill | |
20 | +from collections import Counter | |
21 | +from gensim.models import LdaModel | |
22 | + | |
23 | + | |
24 | + | |
25 | +# In[3]: | |
26 | + | |
27 | +#30_50_50_150_0.0001 | |
28 | + | |
29 | +# In[4]: | |
30 | + | |
31 | +#db=shelve.open("SPELIKE_MLP_DB.shelve",writeback=True) | |
32 | +origin_corps=shelve.open("{}".format(sys.argv[2])) | |
33 | +in_dir = sys.argv[1] | |
34 | + | |
35 | + | |
36 | +out_db=shelve.open("{}/mlp_scores.shelve".format(in_dir),writeback=True) | |
37 | + | |
38 | +mlp_h = [ 250, 250 ] | |
39 | +mlp_loss = "categorical_crossentropy" | |
40 | +mlp_dropouts = [0.25]* len(mlp_h) | |
41 | +mlp_sgd = Adam(lr=0.0001) | |
42 | +mlp_epochs = 3000 | |
43 | +mlp_batch_size = 1 | |
44 | +mlp_input_activation = "relu" | |
45 | +mlp_output_activation="softmax" | |
46 | + | |
47 | +ress = [] | |
48 | +for key in ["TRS", "ASR"] : | |
49 | + | |
50 | + res=mlp.train_mlp(origin_corps["LDA"][key]["TRAIN"],origin_corps["LABEL"][key]["TRAIN"], | |
51 | + origin_corps["LDA"][key]["DEV"],origin_corps["LABEL"][key]["DEV"], | |
52 | + origin_corps["LDA"][key]["TEST"],origin_corps["LABEL"][key]["TEST"], | |
53 | + mlp_h,dropouts=mlp_dropouts,sgd=mlp_sgd, | |
54 | + epochs=mlp_epochs, | |
55 | + batch_size=mlp_batch_size, | |
56 | + save_pred=False,keep_histo=False, | |
57 | + loss="categorical_crossentropy",fit_verbose=0) | |
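 | + # res appears to hold per-epoch scores as (train, dev, test) lists; the | |
 | + # loop below keeps the 12 epochs with the best dev score | |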
58 | + arg_best=[] | |
59 | + dev_best=[] | |
60 | + for _ in range(12): | |
61 | + arg_best.append(numpy.argmax(res[1])) | |
62 | + dev_best.append(res[1][arg_best[-1]]) | |
63 | + res[1][arg_best[-1]]=0 | |
64 | + | |
65 | + test_best =[ res[2][x] for x in arg_best ] | |
66 | + test_max = numpy.max(res[2]) | |
67 | + out_db[key]=(res,(dev_best,test_best,test_max)) | |
68 | + ress.append((key,dev_best,test_best,test_max)) | |
69 | + | |
70 | +for el in ress : | |
71 | + print el | |
72 | +out_db.close() | |
73 | +origin_corps.close() |
LDA/04b-mmf_mini_ae.py
1 | + | |
2 | +# coding: utf-8 | |
3 | + | |
4 | +# In[2]: | |
5 | + | |
6 | +# Import | |
7 | +import gensim | |
8 | +from scipy import sparse | |
9 | +import itertools | |
10 | +from sklearn import preprocessing | |
11 | +from keras.models import Sequential | |
12 | +from keras.optimizers import SGD,Adam | |
13 | +from mlp import * | |
14 | +import sklearn.metrics | |
15 | +import shelve | |
16 | +import pickle | |
17 | +from utils import * | |
18 | +import sys | |
19 | +import os | |
20 | +import json | |
21 | +# In[4]: | |
22 | + | |
23 | +infer_model=shelve.open("{}".format(sys.argv[2])) | |
24 | +in_dir = sys.argv[1] | |
25 | +#['ASR', 'TRS', 'LABEL'] | |
26 | +# In[6]: | |
27 | + | |
28 | + | |
29 | +hidden_size=[ 100 , 50, 100 ] | |
30 | +input_activation="tanh" | |
31 | +output_activation="tanh" | |
32 | +loss="mse" | |
33 | +epochs=1000 | |
34 | +batch=1 | |
35 | +patience=60 | |
36 | +do_do=[False] | |
37 | +sgd = Adam(lr=0.000001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) | |
38 | + | |
39 | + | |
40 | + | |
41 | +mlp_h = [ 150 ,150 ,150 ] | |
42 | +mlp_loss = "categorical_crossentropy" | |
43 | +mlp_dropouts = [] | |
44 | +mlp_sgd = Adam(lr=0.0001) | |
45 | +mlp_epochs = 2000 | |
46 | +mlp_batch_size = 8 | |
47 | +mlp_output_activation="softmax" | |
48 | + | |
49 | +try : | |
50 | + sgd_repr=sgd.get_config()["name"] | |
51 | +except AttributeError : | |
52 | + sgd_repr=sgd | |
53 | + | |
54 | +try : | |
55 | + mlp_sgd_repr=mlp_sgd.get_config()["name"] | |
56 | +except AttributeError : | |
57 | + mlp_sgd_repr=mlp_sgd | |
58 | + | |
59 | + | |
60 | +params={ "h1" : "_".join([ str(x) for x in hidden_size ]), | |
61 | + "inside_activation" : input_activation, | |
62 | + "output_activation" : output_activation, | |
63 | + "do_dropout": "_".join([str(x) for x in do_do]), | |
64 | + "loss" : loss, | |
65 | + "epochs" : epochs , | |
66 | + "batch_size" : batch, | |
67 | + "patience" : patience, | |
68 | + "sgd" : sgd_repr, | |
69 | + "mlp_h ": "_".join([str(x) for x in mlp_h]), | |
70 | + "mlp_loss ": mlp_loss, | |
71 | + "mlp_dropouts ": "_".join([str(x) for x in mlp_dropouts]), | |
72 | + "mlp_sgd ": mlp_sgd_repr, | |
73 | + "mlp_epochs ": mlp_epochs, | |
74 | + "mlp_batch_size ": mlp_batch_size, | |
75 | + "mlp_output" : mlp_output_activation | |
76 | + } | |
77 | +name = "_".join([ str(x) for x in params.values()]) | |
78 | +try: | |
79 | + os.mkdir("{}/{}".format(in_dir,name)) | |
80 | +except OSError: | |
81 | + pass | |
82 | +db = shelve.open("{}/{}/ae_model.shelve".format(in_dir,name),writeback=True) | |
83 | +db["params"] = params | |
84 | +db["LABEL"]=infer_model["LABEL"] | |
85 | +# | |
86 | +json.dump(params, | |
87 | + open("{}/{}/ae_model.json".format(in_dir,name),"w"), | |
88 | + indent=4) | |
89 | + | |
90 | +keys = ["ASR","TRS"] | |
91 | + | |
92 | +db["AE"] = {} | |
93 | +db["LDA"] = {} | |
94 | +for mod in keys : | |
95 | + print mod | |
96 | + db["LDA"][mod] = train_mlp(infer_model["LDA"][mod]["TRAIN"],infer_model["LABEL"][mod]["TRAIN"], | |
97 | + infer_model["LDA"][mod]["DEV"],infer_model["LABEL"][mod]["DEV"], | |
98 | + infer_model["LDA"][mod]["TEST"],infer_model["LABEL"][mod]["TEST"], | |
99 | + mlp_h ,sgd=mlp_sgd, | |
100 | + epochs=mlp_epochs, | |
101 | + batch_size=mlp_batch_size, | |
102 | + input_activation=input_activation, | |
103 | + output_activation=mlp_output_activation, | |
104 | + dropouts=mlp_dropouts, | |
105 | + fit_verbose=0) | |
106 | + | |
107 | + res=train_ae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"],infer_model["LDA"][mod]["TEST"], | |
108 | + hidden_size,patience = params["patience"],sgd=sgd, | |
109 | + dropouts=do_do,input_activation=input_activation,output_activation=output_activation, | |
110 | + loss=loss,epochs=epochs,batch_size=batch,verbose=0) | |
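 | + # train_ae gives one (train, dev, test) encoding triple per layer; each | |
 | + # layer's codes are probed with a supervised MLP below | |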
111 | + mlp_res_list=[] | |
112 | + for layer in res : | |
113 | + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], | |
114 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
115 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
116 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs, | |
117 | + output_activation=mlp_output_activation, | |
118 | + input_activation=input_activation, | |
119 | + batch_size=mlp_batch_size,fit_verbose=0)) | |
120 | + db["AE"][mod]=mlp_res_list | |
121 | + | |
122 | +mod = "ASR" | |
123 | +mod2= "TRS" | |
124 | +mlp_res_list=[] | |
125 | + | |
126 | +res = train_ae(infer_model["LDA"][mod]["TRAIN"], | |
127 | + infer_model["LDA"][mod]["DEV"], | |
128 | + infer_model["LDA"][mod]["TEST"], | |
129 | + hidden_size,dropouts=do_do,patience = params["patience"], | |
130 | + sgd=sgd,input_activation=input_activation,output_activation=output_activation,loss=loss,epochs=epochs, | |
131 | + batch_size=batch, | |
132 | + y_train=infer_model["LDA"][mod2]["TRAIN"], | |
133 | + y_dev=infer_model["LDA"][mod2]["DEV"], | |
134 | + y_test=infer_model["LDA"][mod2]["TEST"]) | |
135 | + | |
136 | +for layer in res : | |
137 | + mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], | |
138 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
139 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
140 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs, | |
141 | + output_activation=mlp_output_activation, | |
142 | + input_activation=input_activation, | |
143 | + batch_size=mlp_batch_size,fit_verbose=0)) | |
144 | + | |
145 | +db["AE"]["SPE"] = mlp_res_list | |
146 | + | |
147 | +db.sync() | |
148 | +db.close() |
LDA/04c-mmf_sae.py
1 | + | |
2 | +# coding: utf-8 | |
3 | + | |
4 | +# In[2]: | |
5 | + | |
6 | +# Import | |
7 | +import gensim | |
8 | +from scipy import sparse | |
9 | +import itertools | |
10 | +from sklearn import preprocessing | |
11 | +from keras.models import Sequential | |
12 | +from keras.optimizers import SGD,Adam | |
13 | +from mlp import * | |
14 | +import mlp | |
15 | +import sklearn.metrics | |
16 | +import shelve | |
17 | +import pickle | |
18 | +from utils import * | |
19 | +import sys | |
20 | +import os | |
21 | +import json | |
22 | +# In[4]: | |
23 | + | |
24 | +infer_model=shelve.open("{}".format(sys.argv[2])) | |
25 | +in_dir = sys.argv[1] | |
26 | +#['ASR', 'TRS', 'LABEL'] | |
27 | +# In[6]: | |
28 | + | |
29 | + | |
30 | +hidden_size=[ 100, 80, 50 , 20 ] | |
31 | +input_activation="relu" | |
32 | +output_activation="relu" | |
33 | +loss="mse" | |
34 | +epochs=3000 | |
35 | +batch=1 | |
36 | +patience=20 | |
37 | +do_do=[ 0 ] * len(hidden_size) | |
38 | +sgd = Adam(lr=0.0001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) | |
39 | +try : | |
40 | + sgd_repr=sgd.get_config()["name"] | |
41 | +except AttributeError : | |
42 | + sgd_repr=sgd | |
43 | + | |
44 | +params={ "h1" : "_".join([str(x) for x in hidden_size]), | |
45 | + "inside_activation" : input_activation, | |
46 | + "out_activation" : output_activation, | |
47 | + "do_dropout": "_".join([str(x) for x in do_do]), | |
48 | + "loss" : loss, | |
49 | + "epochs" : epochs , | |
50 | + "batch_size" : batch, | |
51 | + "patience" : patience, | |
52 | + "sgd" : sgd_repr} | |
53 | +name = "_".join([ str(x) for x in params.values()]) | |
54 | +try: | |
55 | + os.mkdir("{}/SAE_{}".format(in_dir,name)) | |
56 | +except OSError: | |
57 | + pass | |
58 | +db = shelve.open("{}/SAE_{}/ae_model.shelve".format(in_dir,name),writeback=True) | |
59 | +# | |
60 | +json.dump(params, | |
61 | + open("{}/SAE_{}/ae_model.json".format(in_dir,name),"w"), | |
62 | + indent=4) | |
63 | + | |
64 | +keys = ["ASR","TRS"] | |
65 | + | |
66 | +mlp_h = [ 150 , 300 ] | |
67 | +mlp_loss ="categorical_crossentropy" | |
68 | +mlp_dropouts = [0,0,0,0] | |
69 | +mlp_sgd = Adam(0.001) | |
70 | +mlp_epochs = 2000 | |
71 | +mlp_batch_size = 8 | |
72 | + | |
73 | +db["SAE"] = {} | |
74 | + | |
75 | +db["SAEFT"] = {} | |
76 | +for mod in keys : | |
77 | + print "MODE ", mod | |
78 | + res_tuple=train_sae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"], | |
79 | + infer_model["LDA"][mod]["TEST"], | |
80 | + hidden_size,dropouts=do_do, | |
81 | + patience = params["patience"],sgd=sgd,input_activation="tanh", | |
82 | + output_activation="tanh",loss=loss,epochs=epochs, | |
83 | + batch_size=batch,verbose=0) | |
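 | + # res_tuple lines up with ["SAE","SAEFT"] below: encodings from the | |
 | + # pretrained stack and from the fine-tuned stack | |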
84 | + #print len(res), [len(x) for x in res[0]], [ len(x) for x in res[1]] | |
85 | + for name , levels in zip(["SAE","SAEFT"],res_tuple): | |
86 | + print "NAME", name | |
87 | + mlp_res_by_level = [] | |
88 | + for res in levels: | |
89 | + mlp_res_list=[] | |
90 | + for nb,layer in enumerate(res) : | |
91 | + print "layer NB",nb | |
92 | + mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], | |
93 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
94 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
95 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, | |
96 | + sgd=mlp_sgd,epochs=mlp_epochs,batch_size=mlp_batch_size, | |
97 | + fit_verbose=0)) | |
98 | + mlp_res_by_level.append(mlp_res_list) | |
99 | + db[name][mod]=mlp_res_by_level | |
100 | + | |
101 | +mod = "ASR" | |
102 | +mod2= "TRS" | |
103 | +print "mode SPE " | |
104 | +res_tuple = train_sae(infer_model["LDA"][mod]["TRAIN"], | |
105 | + infer_model["LDA"][mod]["DEV"], | |
106 | + infer_model["LDA"][mod]["TEST"], | |
107 | + hidden_size,dropouts=[0],patience=params["patience"], | |
108 | + sgd=sgd,input_activation=input_activation,output_activation=input_activation, | |
109 | + loss=loss,epochs=epochs,batch_size=batch, | |
110 | + y_train=infer_model["LDA"][mod2]["TRAIN"], | |
111 | + y_dev=infer_model["LDA"][mod2]["DEV"], | |
112 | + y_test=infer_model["LDA"][mod2]["TEST"]) | |
113 | + | |
114 | +for name , levels in zip(["SAE","SAEFT"],res_tuple): | |
115 | + mlp_res_by_level = [] | |
116 | + for res in levels : | |
117 | + mlp_res_list=[] | |
118 | + for layer in res : | |
119 | + mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], | |
120 | + layer[1],infer_model["LABEL"][mod]["DEV"],layer[2], | |
121 | + infer_model["LABEL"][mod]["TEST"], | |
122 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, | |
123 | + sgd=mlp_sgd,epochs=mlp_epochs,batch_size=mlp_batch_size, | |
124 | + fit_verbose=0)) | |
125 | + mlp_res_by_level.append(mlp_res_list) | |
126 | + db[name]["SPE"] = mlp_res_by_level | |
127 | + | |
128 | +db.close() |
LDA/04d-mmf_dsae.py
1 | + | |
2 | +# coding: utf-8 | |
3 | + | |
4 | +# In[2]: | |
5 | + | |
6 | +# Import | |
7 | +import gensim | |
8 | +from scipy import sparse | |
9 | +import itertools | |
10 | +from sklearn import preprocessing | |
11 | +from keras.models import Sequential | |
12 | +from keras.optimizers import SGD,Adam | |
13 | +from mlp import * | |
14 | +import mlp | |
15 | +import sklearn.metrics | |
16 | +import shelve | |
17 | +import pickle | |
18 | +from utils import * | |
19 | +import sys | |
20 | +import os | |
21 | +import json | |
22 | +# In[4]: | |
23 | + | |
24 | +infer_model=shelve.open("{}".format(sys.argv[2])) | |
25 | +in_dir = sys.argv[1] | |
26 | +#['ASR', 'TRS', 'LABEL'] | |
27 | +# In[6]: | |
28 | + | |
29 | +# AE params | |
30 | +hidden_size=[ 100, 100 ] | |
31 | +input_activation="relu" | |
32 | +output_activation="relu" | |
33 | +loss="mse" | |
34 | +epochs= 1000 | |
35 | +batch_size=1 | |
36 | +patience=20 | |
37 | +do_do=[ 0.25 ] * len(hidden_size) | |
38 | +sgd = Adam(lr=0.00001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) | |
39 | +try : | |
40 | + sgd_repr=sgd.get_config()["name"] | |
41 | +except AttributeError : | |
42 | + sgd_repr=sgd | |
43 | + | |
44 | +# Transforme : | |
45 | +trans_hidden_size=[ 300 , 300 ] | |
46 | +trans_input_activation="relu" | |
47 | +trans_output_activation="relu" | |
48 | +trans_loss="mse" | |
49 | +trans_epochs=1000 | |
50 | +trans_batch_size=8 | |
51 | +trans_patience=20 | |
52 | +trans_do=[ 0.25 ] * len(trans_hidden_size) | |
53 | +trans_sgd = Adam(lr=0.0001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) | |
54 | +try : | |
55 | + trans_sgd_repr=trans_sgd.get_config()["name"] | |
56 | +except AttributeError : | |
57 | + trans_sgd_repr=trans_sgd | |
58 | + | |
59 | + | |
60 | + | |
61 | +ae={ "h1" : "_".join([str(x) for x in hidden_size]), | |
62 | + "inside_activation" : input_activation, | |
63 | + "out_activation" : output_activation, | |
64 | + "do_dropout": "_".join([str(x) for x in do_do]), | |
65 | + "loss" : loss, | |
66 | + "epochs" : epochs , | |
67 | + "batch_size" : batch_size, | |
68 | + "patience" : patience, | |
69 | + "sgd" : sgd_repr} | |
70 | +name = "_".join([ str(x) for x in ae.values()]) | |
71 | + | |
72 | +trans={ "h1" : "_".join([str(x) for x in trans_hidden_size]), | |
73 | + "inside_activation" : trans_input_activation, | |
74 | + "out_activation" : trans_output_activation, | |
75 | + "do_dropout": "_".join([str(x) for x in trans_do]), | |
76 | + "loss" : trans_loss, | |
77 | + "epochs" : trans_epochs , | |
78 | + "batch_size" : trans_batch_size, | |
79 | + "patience" : trans_patience, | |
80 | + "sgd" : trans_sgd_repr} | |
81 | + | |
82 | +mlp_h = [ 300 , 300 ] | |
83 | +mlp_loss ="categorical_crossentropy" | |
84 | +mlp_dropouts = [0,0,0,0] | |
85 | +mlp_sgd = Adam(0.0001) | |
86 | +mlp_epochs = 1000 | |
87 | +mlp_batch_size = 8 | |
88 | +mlp_input_activation = "relu" | |
89 | +mlp_output_activation = "softmax" | |
90 | + | |
91 | +try : | |
92 | + mlp_sgd_repr=mlp_sgd.get_config()["name"] | |
93 | +except AttributeError : | |
94 | + mlp_sgd_repr=mlp_sgd | |
95 | + | |
96 | + | |
97 | + | |
98 | +mlp={ "h1" : "_".join([str(x) for x in mlp_h ]), | |
99 | + "inside_activation" : mlp_input_activation, | |
100 | + "out_activation" : mlp_output_activation, | |
101 | + "do_dropout": "_".join([str(x) for x in mlp_dropouts]), | |
102 | + "loss" : mlp_loss, | |
103 | + "epochs" : mlp_epochs , | |
104 | + "batch_size" : mlp_batch_size, | |
105 | + "sgd" : mlp_sgd_repr} | |
106 | + | |
107 | +params = { "ae":ae, "trans":trans, "mlp":mlp} | |
108 | +try: | |
109 | + os.mkdir("{}/DSAE_{}".format(in_dir,name)) | |
110 | +except OSError: | |
111 | + pass | |
112 | +db = shelve.open("{}/DSAE_{}/ae_model.shelve".format(in_dir,name),writeback=True) | |
113 | +# | |
114 | +json.dump(params, | |
115 | + open("{}/DSAE_{}/ae_model.json".format(in_dir,name),"w"), | |
116 | + indent=4) | |
117 | + | |
118 | +keys = ["ASR","TRS"] | |
119 | + | |
120 | + | |
121 | + | |
122 | +db["DSAE"] = {} | |
123 | + | |
124 | +db["DSAEFT"] = {} | |
125 | +mod = "ASR" | |
126 | +res_tuple_ASR = train_ae(infer_model["LDA"][mod]["TRAIN"], | |
127 | + infer_model["LDA"][mod]["DEV"], | |
128 | + infer_model["LDA"][mod]["TEST"], | |
129 | + hidden_size,dropouts=do_do, | |
130 | + patience = patience,sgd=sgd, | |
131 | + input_activation=input_activation, | |
132 | + output_activation=output_activation,loss=loss,epochs=epochs, | |
133 | + batch_size=batch_size,verbose=0,get_weights=True) | |
134 | +mlp_res_list = [] | |
135 | +for layer in res_tuple_ASR[0]: | |
136 | + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], | |
137 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
138 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
139 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, | |
140 | + sgd=mlp_sgd,epochs=mlp_epochs, | |
141 | + output_activation=mlp_output_activation, | |
142 | + input_activation=mlp_input_activation, | |
143 | + batch_size=mlp_batch_size,fit_verbose=0)) | |
144 | + | |
145 | +db["DSAE"][mod] = mlp_res_list | |
146 | +mod = "TRS" | |
147 | +print hidden_size | |
148 | +res_tuple_TRS = train_ae(infer_model["LDA"][mod]["TRAIN"], | |
149 | + infer_model["LDA"][mod]["DEV"], | |
150 | + infer_model["LDA"][mod]["TEST"], | |
151 | + hidden_size,dropouts=do_do, | |
152 | + sgd=sgd,input_activation=input_activation, | |
153 | + output_activation=output_activation,loss=loss,epochs=epochs, | |
154 | + batch_size=batch_size,patience=patience, | |
155 | + verbose=0,get_weights=True) | |
156 | + | |
157 | +mlp_res_list = [] | |
158 | +for layer in res_tuple_TRS[0]: | |
159 | + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], | |
160 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
161 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
162 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, | |
163 | + sgd=mlp_sgd,epochs=mlp_epochs, | |
164 | + output_activation=mlp_output_activation, | |
165 | + input_activation=mlp_input_activation, | |
166 | + batch_size=mlp_batch_size,fit_verbose=0)) | |
167 | + | |
168 | +db["DSAE"][mod] = mlp_res_list | |
169 | + | |
170 | + | |
171 | + | |
172 | +transfert = [] | |
173 | + | |
174 | +print " get weight trans" | |
175 | + | |
176 | +for asr_pred, trs_pred in zip(res_tuple_ASR[0], res_tuple_TRS[0]): | |
177 | + print "ASR", [ x.shape for x in asr_pred] | |
178 | + | |
179 | + print "TRS", [ x.shape for x in trs_pred] | |
180 | + | |
181 | + | |
182 | +for asr_pred, trs_pred in zip(res_tuple_ASR[0], res_tuple_TRS[0]): | |
183 | + print "ASR", [ x.shape for x in asr_pred] | |
184 | + | |
185 | + print "TRS", [ x.shape for x in trs_pred] | |
186 | + transfert.append( train_ae(asr_pred[0], | |
187 | + asr_pred[1], | |
188 | + asr_pred[2], | |
189 | + trans_hidden_size, | |
190 | + dropouts=trans_do, | |
191 | + y_train = trs_pred[0], | |
192 | + y_dev=trs_pred[1], | |
193 | + y_test = trs_pred[2], | |
194 | + patience = trans_patience,sgd=trans_sgd, | |
195 | + input_activation=trans_input_activation, | |
196 | + output_activation=trans_output_activation, | |
197 | + loss=trans_loss, | |
198 | + epochs=trans_epochs, | |
199 | + batch_size=trans_batch_size,verbose=0,get_weights=True) ) | |
200 | +mod = "ASR" | |
201 | +mlp_res_bylvl = [] | |
202 | +print " MLP on transfert " | |
203 | +for level, w in transfert : | |
204 | + mlp_res_list = [] | |
205 | + for layer in level : | |
206 | + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], | |
207 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
208 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
209 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, | |
210 | + sgd=mlp_sgd,epochs=mlp_epochs, | |
211 | + output_activation=mlp_output_activation, | |
212 | + input_activation=mlp_input_activation, | |
213 | + batch_size=mlp_batch_size,fit_verbose=0)) | |
214 | + mlp_res_bylvl.append(mlp_res_list) | |
215 | +db["DSAE"]["transfert"] = mlp_res_bylvl | |
216 | + | |
217 | + | |
218 | +print " FT " | |
219 | +WA = res_tuple_ASR[1] | |
220 | +print "WA", len(WA), [ len(x) for x in WA] | |
221 | +WT = res_tuple_TRS[1] | |
222 | + | |
223 | +print "WT", len(WT), [ len(x) for x in WT] | |
224 | +Wtr = [ x[1] for x in transfert] | |
225 | + | |
226 | +print "Wtr", len(Wtr), [ len(x) for x in Wtr],[ len(x[1]) for x in Wtr] | |
227 | + | |
228 | +ft_res = ft_dsae(infer_model["LDA"]["ASR"]["TRAIN"], | |
229 | + infer_model["LDA"]["ASR"]["DEV"], | |
230 | + infer_model["LDA"]["ASR"]["TEST"], | |
231 | + y_train=infer_model["LDA"]["TRS"]["TRAIN"], | |
232 | + y_dev=infer_model["LDA"]["TRS"]["DEV"], | |
233 | + y_test=infer_model["LDA"]["TRS"]["TEST"], | |
234 | + ae_hidden = hidden_size, | |
235 | + transfer_hidden = trans_hidden_size, | |
236 | + start_weights = WA, | |
237 | + transfer_weights = Wtr, | |
238 | + end_weights = WT, | |
239 | + input_activation = input_activation, | |
240 | + output_activation = output_activation, | |
241 | + ae_dropouts= do_do, | |
242 | + transfer_do = trans_do, | |
243 | + sgd = sgd, | |
244 | + loss = loss , | |
245 | + patience = patience, | |
246 | + batch_size = batch_size, | |
247 | + epochs= epochs) | |
248 | +mlps_by_lvls= [] | |
249 | +for level in ft_res : | |
250 | + mlp_res_list = [] | |
251 | + for layer in level : | |
252 | + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], | |
253 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
254 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
255 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, | |
256 | + sgd=mlp_sgd,epochs=mlp_epochs, | |
257 | + output_activation=mlp_output_activation, | |
258 | + input_activation=mlp_input_activation, | |
259 | + batch_size=mlp_batch_size,fit_verbose=0)) | |
260 | + mlps_by_lvls.append(mlp_res_list) | |
261 | + | |
262 | + | |
263 | +db["DSAEFT"]["transfert"] = mlps_by_lvls | |
264 | + | |
265 | +db.close() |
LDA/04e-mm_vae.py
1 | + | |
2 | +# coding: utf-8 | |
3 | + | |
4 | +# In[2]: | |
5 | + | |
6 | +# Import | |
7 | +import gensim | |
8 | +from scipy import sparse | |
9 | +import itertools | |
10 | +from sklearn import preprocessing | |
11 | +from keras.models import Sequential | |
12 | +from keras.optimizers import SGD,Adam | |
13 | +from mlp import * | |
14 | +from vae import * | |
15 | +import sklearn.metrics | |
16 | +import shelve | |
17 | +import pickle | |
18 | +from utils import * | |
19 | +import sys | |
20 | +import os | |
21 | +import json | |
22 | +# In[4]: | |
23 | + | |
24 | +infer_model=shelve.open("{}".format(sys.argv[2])) | |
25 | +in_dir = sys.argv[1] | |
26 | +#['ASR', 'TRS', 'LABEL'] | |
27 | +# In[6]: | |
28 | + | |
29 | + | |
30 | +hidden_size= [60] | |
31 | +input_activation="tanh" | |
32 | +output_activation="sigmoid" | |
33 | +epochs=300 | |
34 | +batch=1 | |
35 | +patience=60 | |
36 | +sgd = Adam(lr=0.0001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) | |
37 | +latent_dim = 30 | |
38 | + | |
39 | + | |
40 | + | |
41 | +mlp_h = [ 256 ] | |
42 | +mlp_loss = "categorical_crossentropy" | |
43 | +mlp_dropouts = [] | |
44 | +mlp_sgd = Adam(lr=0.001) | |
45 | +mlp_epochs = 1000 | |
46 | +mlp_batch_size = 16 | |
47 | +mlp_output_activation="softmax" | |
48 | + | |
49 | +try : | |
50 | + sgd_repr=sgd.get_config()["name"] | |
51 | +except AttributeError : | |
52 | + sgd_repr=sgd | |
53 | + | |
54 | +try : | |
55 | + mlp_sgd_repr=mlp_sgd.get_config()["name"] | |
56 | +except AttributeError : | |
57 | + mlp_sgd_repr=mlp_sgd | |
58 | + | |
59 | + | |
60 | +params={ "h1" : "_".join([ str(x) for x in hidden_size ]), | |
61 | + "inside_activation" : input_activation, | |
62 | + "output_activation" : output_activation, | |
63 | + "epochs" : epochs , | |
64 | + "batch_size" : batch, | |
65 | + "patience" : patience, | |
66 | + "sgd" : sgd_repr, | |
67 | + "mlp_h ": "_".join([str(x) for x in mlp_h]), | |
68 | + "mlp_loss ": mlp_loss, | |
69 | + "mlp_dropouts ": "_".join([str(x) for x in mlp_dropouts]), | |
70 | + "mlp_sgd ": mlp_sgd_repr, | |
71 | + "mlp_epochs ": mlp_epochs, | |
72 | + "mlp_batch_size ": mlp_batch_size, | |
73 | + "mlp_output" : mlp_output_activation | |
74 | + } | |
75 | +name = "_".join([ str(x) for x in params.values()]) | |
76 | +try: | |
77 | + os.mkdir("{}/VAE_{}".format(in_dir,name)) | |
78 | +except OSError: | |
79 | + pass | |
80 | +db = shelve.open("{}/VAE_{}/ae_model.shelve".format(in_dir,name),writeback=True) | |
81 | +db["params"] = params | |
82 | +db["LABEL"]=infer_model["LABEL"] | |
83 | +# | |
84 | +json.dump(params, | |
85 | + open("{}/VAE_{}/ae_model.json".format(in_dir,name),"w"), | |
86 | + indent=4) | |
87 | + | |
88 | +keys = ["ASR","TRS"] | |
89 | + | |
90 | +db["VAE"] = {} | |
91 | +db["LDA"] = {} | |
92 | +for mod in keys : | |
93 | + print mod | |
94 | + db["LDA"][mod] = train_mlp(infer_model["LDA"][mod]["TRAIN"],infer_model["LABEL"][mod]["TRAIN"], | |
95 | + infer_model["LDA"][mod]["DEV"],infer_model["LABEL"][mod]["DEV"], | |
96 | + infer_model["LDA"][mod]["TEST"],infer_model["LABEL"][mod]["TEST"], | |
97 | + mlp_h ,sgd=mlp_sgd, | |
98 | + epochs=mlp_epochs, | |
99 | + batch_size=mlp_batch_size, | |
100 | + input_activation=input_activation, | |
101 | + output_activation=mlp_output_activation, | |
102 | + dropouts=mlp_dropouts, | |
103 | + fit_verbose=0) | |
104 | + | |
105 | + res=train_vae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"],infer_model["LDA"][mod]["TEST"], | |
106 | + hidden_size=hidden_size[0], | |
107 | + latent_dim=latent_dim,sgd=sgd, | |
108 | + input_activation=input_activation,output_activation=output_activation, | |
109 | + nb_epochs=epochs,batch_size=batch) | |
110 | + mlp_res_list=[] | |
111 | + for layer in res : | |
112 | + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], | |
113 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
114 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
115 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs, | |
116 | + output_activation=mlp_output_activation, | |
117 | + input_activation=input_activation, | |
118 | + batch_size=mlp_batch_size,fit_verbose=0)) | |
119 | + db["VAE"][mod]=mlp_res_list | |
120 | + | |
121 | +mod = "ASR" | |
122 | +mod2= "TRS" | |
123 | +mlp_res_list=[] | |
124 | + | |
125 | +res = train_vae(infer_model["LDA"][mod]["TRAIN"], | |
126 | + infer_model["LDA"][mod]["DEV"], | |
127 | + infer_model["LDA"][mod]["TEST"], | |
128 | + hidden_size=hidden_size[0], | |
129 | + sgd=sgd,input_activation=input_activation,output_activation=output_activation, | |
130 | + latent_dim=latent_dim, | |
131 | + nb_epochs=epochs, | |
132 | + batch_size=batch, | |
133 | + y_train=infer_model["LDA"][mod2]["TRAIN"], | |
134 | + y_dev=infer_model["LDA"][mod2]["DEV"], | |
135 | + y_test=infer_model["LDA"][mod2]["TEST"]) | |
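 | +# cross-modal setup: the VAE encodes ASR topic vectors while being trained to | |
 | +# reconstruct their TRS counterparts; the resulting codes are stored as "SPE" | |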
136 | + | |
137 | +for layer in res : | |
138 | + mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], | |
139 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
140 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
141 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs, | |
142 | + output_activation=mlp_output_activation, | |
143 | + input_activation=input_activation, | |
144 | + batch_size=mlp_batch_size,fit_verbose=0)) | |
145 | + | |
146 | +db["VAE"]["SPE"] = mlp_res_list | |
147 | + | |
148 | +db.sync() | |
149 | +db.close() |
LDA/05-mmf_getscore.py
1 | +import numpy as np | |
2 | +import shelve | |
3 | +import sys | |
4 | +import glob | |
5 | +from collections import defaultdict | |
6 | +from tinydb import TinyDB, Query | |
7 | +from mako.template import Template | |
8 | +import time | |
9 | + | |
10 | +def get_best(x): | |
11 | + argbest=np.argmax(x[1]) | |
12 | + maxdev=x[1][argbest] | |
13 | + maxtrain=np.max(x[0]) | |
14 | + maxtest=np.max(x[2]) | |
15 | + besttest=x[2][argbest] | |
16 | + return ( maxtrain,maxdev,maxtest,besttest) | |
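 | +# x = (train_scores, dev_scores, test_scores); "best test" is the test score | |
 | +# taken at the epoch that maximizes dev | |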
17 | +depth = lambda L: isinstance(L, list) and max(map(depth, L))+1 # nesting depth; returns False for non-lists | |
18 | + | |
19 | + | |
20 | +template_name = ''' | |
21 | +${name} | |
22 | +======================== | |
23 | + | |
24 | +MLP scores : | |
25 | +------------------- | |
26 | +''' | |
27 | +template_value='''\n\n | |
28 | +| ${model} ${ttype} | train | dev |max test| best test| | |
29 | +| -------------------:|:--------:|:---------:|:------:|:--------:| | |
30 | +% for cpt,line in enumerate(models[model][ttype]): | |
31 | +| ${cpt} | ${line[0]} | ${line[1]} |${line[2]} | ${line[3]} | | |
32 | +% endfor | |
33 | +\n | |
34 | +''' | |
35 | + | |
36 | +# ae_model.shelve | |
37 | +def get_folder_file(x): | |
38 | + folder=x.split("/")[1] | |
39 | + shelve_file = ".".join(x.split(".")[:-1]) | |
40 | + return(folder,shelve_file) | |
41 | + | |
42 | +in_folder = sys.argv[1] | |
43 | + | |
44 | + | |
45 | +models = defaultdict(dict) | |
46 | + | |
47 | +ae_model_list = glob.glob("{}/*/ae_model.shelve.dir".format(in_folder)) | |
48 | +ae_model_list = sorted(ae_model_list) | |
49 | +ae_model_list= map(get_folder_file,ae_model_list) | |
50 | +for name , shelve_file in ae_model_list : | |
51 | + print Template(template_name).render(name=name) | |
52 | + opened_shelve = shelve.open(shelve_file) | |
53 | + keys = opened_shelve.keys() | |
54 | + if "LABEL" in keys : | |
55 | + keys.remove("LABEL") | |
56 | + if "params" in keys: | |
57 | + keys.remove("params") | |
58 | + to_print = [] | |
59 | + for working_key in keys: | |
60 | + for key in opened_shelve[working_key].keys(): | |
61 | + table_depth = depth(opened_shelve[working_key][key]) | |
62 | + if table_depth == 3 : | |
63 | + models[working_key][key] = [ get_best(x) for x in opened_shelve[working_key][key] ] | |
64 | + to_print.append(Template(template_value).render(model=working_key,ttype=key,models=models).strip()) | |
65 | + elif table_depth == 2 : | |
66 | + models[working_key][key] = [ get_best(opened_shelve[working_key][key]) ] | |
67 | + to_print.append(Template(template_value).render(model=working_key,ttype=key,models=models).strip()) | |
68 | + elif table_depth == 4 : | |
69 | + for layer in opened_shelve[working_key][key] : | |
70 | + models[working_key][key] = [ get_best(x) for x in layer ] | |
71 | + to_print.append(Template(template_value).render(model=working_key,ttype=key,models=models).strip()) | |
72 | + print "\n".join(to_print) |
LDA/run2.sh
1 | 1 | #python 00-prepross.py |
2 | -python 02-lda-order.py DECODA_list_wid.shelve output_v5/perplex.db 50 10 output_v5 50_500 0 1_0.1 1_0.1 500_1000 100_2000 | |
2 | +python 02-lda.py DECODA_list_wid.shelve output_v5/perplex.db 50 10 output_v5 50_500 0 1_0.1 1_0.1 500_1000 100_2000 | |
3 | 3 | #python 03-perplex.py DECODA_list_wid.shelve output_v5 output_v5/perplex.db |
4 | 4 | python 03-order_by_perp.py output_v5/perplex.db output_v5 |
5 | 5 | bash 04-run_mlp_ae.sh output_v5 DECODA_list_wid.shelve |
LDA/utils.py
LDA/vae.py
1 | +'''This script demonstrates how to build a variational autoencoder with Keras. | |
2 | +Reference: "Auto-Encoding Variational Bayes" https://arxiv.org/abs/1312.6114 | |
3 | +''' | |
4 | + | |
5 | +import itertools | |
6 | +import sys | |
7 | +import json | |
8 | + | |
9 | +import numpy as np | |
10 | +import matplotlib.pyplot as plt | |
11 | +from scipy import sparse | |
12 | +import scipy.io | |
13 | + | |
14 | +from keras.layers import Input, Dense, Lambda | |
15 | +from keras.models import Model | |
16 | +from keras import backend as K | |
17 | +from keras import objectives | |
18 | +from keras.datasets import mnist | |
19 | + | |
20 | +import pandas | |
21 | +import shelve | |
22 | +import pickle | |
23 | + | |
24 | + | |
25 | + | |
26 | + | |
27 | + | |
28 | +#batch_size = 16 | |
29 | +#original_dim = 784 | |
30 | +#latent_dim = 2 | |
31 | +#intermediate_dim = 128 | |
32 | +#epsilon_std = 0.01 | |
33 | +#nb_epoch = 40 | |
34 | + | |
35 | + | |
36 | + | |
37 | + | |
38 | +def train_vae(x_train,x_dev,x_test,y_train=None,y_dev=None,y_test=None,hidden_size=80,latent_dim=12,batch_size=8,nb_epochs=10,sgd="rmsprop",input_activation = "relu",output_activation = "sigmoid",epsilon_std=0.01): | |
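 | + # builds and fits the VAE, then returns [[train, dev, test]] projections on | |
 | + # the latent mean; with the fixed batch_shape below, row counts should be | |
 | + # divisible by batch_size (a constraint inherited from the Keras example) | |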
39 | + | |
40 | + | |
41 | + | |
42 | + def sampling(args): | |
43 | + z_mean, z_log_std = args # reparameterization trick: z = mean + exp(log_std) * epsilon | |
44 | + epsilon = K.random_normal(shape=(batch_size, latent_dim), | |
45 | + mean=0., std=epsilon_std) | |
46 | + return z_mean + K.exp(z_log_std) * epsilon | |
47 | + | |
48 | + def vae_loss(x, x_decoded_mean): | |
49 | + xent_loss = objectives.binary_crossentropy(x, x_decoded_mean) | |
50 | + kl_loss = - 0.5 * K.mean(1 + z_log_std - K.square(z_mean) - K.exp(z_log_std), axis=-1) | |
51 | + return xent_loss + kl_loss | |
52 | + | |
53 | + original_dim = x_train.shape[1] | |
54 | + | |
55 | + | |
56 | + x = Input(batch_shape=(batch_size, original_dim)) | |
57 | + h = Dense(hidden_size, activation=input_activation)(x) | |
58 | + z_mean = Dense(latent_dim)(h) | |
59 | + z_log_std = Dense(latent_dim)(h) | |
60 | + | |
61 | + | |
62 | + # note that "output_shape" isn't necessary with the TensorFlow backend | |
63 | + # so you could write `Lambda(sampling)([z_mean, z_log_std])` | |
64 | + z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_std]) | |
65 | + | |
66 | + # we instantiate these layers separately so as to reuse them later | |
67 | + decoder_h = Dense(hidden_size, activation=input_activation) | |
68 | + decoder_mean = Dense(original_dim, activation=output_activation) | |
69 | + h_decoded = decoder_h(z) | |
70 | + x_decoded_mean = decoder_mean(h_decoded) | |
71 | + | |
72 | + | |
73 | + vae = Model(x, x_decoded_mean) | |
74 | + vae.compile(optimizer=sgd, loss=vae_loss) | |
75 | + | |
76 | + # if no separate targets are given, train as a plain reconstruction VAE | |
77 | + if y_train is None or y_dev is None or y_test is None : | |
78 | + y_train = x_train | |
79 | + y_dev = x_dev | |
80 | + y_test = x_test | |
81 | + | |
82 | + vae.fit(x_train, y_train, | |
83 | + shuffle=True, | |
84 | + nb_epoch=nb_epochs, | |
85 | + batch_size=batch_size, | |
86 | + validation_data=(x_dev, y_dev)) | |
87 | + | |
88 | + # build a model to project inputs on the latent space | |
89 | + encoder = Model(x, z_mean) | |
90 | + pred_train = encoder.predict(x_train, batch_size=batch_size) | |
91 | + pred_dev = encoder.predict(x_dev, batch_size=batch_size) | |
92 | + pred_test = encoder.predict(x_test,batch_size=batch_size) | |
93 | + return [ [ pred_train, pred_dev, pred_test ] ] | |
94 | + |