Commit d1012a7a1689588ac0d1e4a716497562663c14c2
Parent: ee9023b1c9
Exists in: master

update LDA/.py

Showing 7 changed files with 8 additions and 289 deletions
LDA/00-mmf_make_features.py
 import sys
 import os
 
 import pandas
 import numpy
 import shelve
 
 from sklearn.preprocessing import LabelBinarizer
 
 from utils import select_mmf as select
 
 input_dir = sys.argv[1] # top-level directory containing ASR and TRS
 level = sys.argv[2]     # desired LDA size ( -5)
 output_dir = sys.argv[3]
 
 lb=LabelBinarizer()
 #y_train=lb.fit_transform([utils.select(ligneid) for ligneid in origin_corps["LABEL"]["TRAIN"]])
 
 
 data = shelve.open("{}/mmf_{}.shelve".format(output_dir,level),writeback=True)
 data["LABEL"]= {}
 data["LDA"] = {"ASR":{},"TRS":{}}
 for mod in ["ASR", "TRS" ]:
-    train = pandas.read_table("{}/{}/train_{}.ssv".format(input_dir, mod, level), sep=" ", header=None )
-    dev = pandas.read_table("{}/{}/dev_{}.ssv".format(input_dir, mod, level), sep=" ", header=None )
-    test = pandas.read_table("{}/{}/test_{}.ssv".format(input_dir, mod, level), sep=" ", header=None )
+    train = pandas.read_table("{}/{}/train_{}.tab".format(input_dir, mod, level), sep=" ", header=None )
+    dev = pandas.read_table("{}/{}/dev_{}.tab".format(input_dir, mod, level), sep=" ", header=None )
+    test = pandas.read_table("{}/{}/test_{}.tab".format(input_dir, mod, level), sep=" ", header=None )
 
     y_train = train.iloc[:,0].apply(select)
     y_dev = dev.iloc[:,0].apply(select)
     y_test = test.iloc[:,0].apply(select)
     lb.fit(y_train)
     data["LABEL"][mod]={"TRAIN":lb.transform(y_train),"DEV":lb.transform(y_dev), "TEST": lb.transform(y_test)}
 
     # data["LDA"][mod]={'ASR':[]}
-    print data["LDA"][mod]
     print train.values
     data["LDA"][mod]["TRAIN"]=train.iloc[:,1:-1].values
     data["LDA"][mod]["DEV"]=dev.iloc[:,1:-1].values
     data["LDA"][mod]["TEST"]=test.iloc[:,1:-1].values
 
+    print data["LDA"][mod]["TRAIN"].shape
 data.sync()
 data.close()
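For orientation, downstream scripts reopen this shelve and pull the features and labels back out. A minimal sketch (the key layout follows the assignments above; the path and level value are illustrative, not taken from the repo):

import shelve

# reopen the feature store written by 00-mmf_make_features.py
data = shelve.open("output/mmf_104.shelve")  # "104" stands in for the `level` argument
for mod in ["ASR", "TRS"]:
    X = data["LDA"][mod]["TRAIN"]    # feature matrix, one row per document
    y = data["LABEL"][mod]["TRAIN"]  # binarized labels from LabelBinarizer
    print mod, X.shape, y.shape      # Python 2 print, matching the scripts in this repo
data.close()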
LDA/02-lda_split.py
File was deleted. Former contents:

import gensim
import os
import sys
import pickle
from gensim.models.ldamodel import LdaModel
from gensim.models.ldamulticore import LdaMulticore
from collections import Counter
import numpy as np
import codecs
import shelve
import logging

def calc_perp(in_dir, train):
    name = in_dir.split("/")[-1]
    # s40_it1_sw50_a0.01_e0.1_p6_c1000
    sw_size = int(name.split("_")[2][2:])

    logging.warning(" go {} ".format(name))

    logging.warning("Redo Vocab and stop")
    asr_count = Counter([x for y in train["ASR_wid"]["TRAIN"] for x in y])
    trs_count = Counter([x for y in train["TRS_wid"]["TRAIN"] for x in y])
    asr_sw = [x[0] for x in asr_count.most_common(sw_size)]
    trs_sw = [x[0] for x in trs_count.most_common(sw_size)]
    stop_words = set(asr_sw) | set(trs_sw)

    logging.warning("TRS to be done")
    # NB: relies on tinydb's Query and a global `db`, neither of which is imported
    # or defined in this file (02b-lda_order.py below does this correctly)
    entry = Query()
    value = db.search(entry.name == name)
    if len(value) > 0:
        logging.warning("{} already done".format(name))
        return

    dev_trs = [[(x, y) for x, y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]]
    lda_trs = LdaModel.load("{}/lda_trs.model".format(in_dir))
    perp_trs = lda_trs.log_perplexity(dev_trs)
    logging.warning("ASR to be done")
    dev_asr = [[(x, y) for x, y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]]
    lda_asr = LdaModel.load("{}/lda_asr.model".format(in_dir))
    perp_asr = lda_asr.log_perplexity(dev_asr)
    logging.warning("ASR saving")
    res_dict = {"name": name, "asr": perp_asr, "trs": perp_trs}
    return res_dict


def train_lda(out_dir, train, name, size, it, sw_size, alpha, eta, passes, chunk):
    output_dir = "{}/s{}_it{}_sw{}_a{}_e{}_p{}_c{}".format(out_dir, size, it, sw_size, alpha, eta, passes, chunk)
    os.mkdir(output_dir)
    logging.info(output_dir + " to be done")
    asr_count = Counter([x for y in train["ASR_wid"]["TRAIN"] for x in y])
    trs_count = Counter([x for y in train["TRS_wid"]["TRAIN"] for x in y])
    asr_sw = [x[0] for x in asr_count.most_common(sw_size)]
    trs_sw = [x[0] for x in trs_count.most_common(sw_size)]
    stop_words = set(asr_sw) | set(trs_sw)

    logging.info("TRS to be done")

    lda_trs = LdaModel(corpus=[[(x, y) for x, y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["TRAIN"]], id2word=train["vocab"], num_topics=int(size), chunksize=1000, iterations=it)

    logging.info("ASR to be done")
    lda_asr = LdaModel(corpus=[[(x, y) for x, y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["TRAIN"]], id2word=train["vocab"], num_topics=int(size), chunksize=1000, iterations=it)

    #logger.info("ASR saving")
    #lda_asr.save("{}/lda_asr.model".format(output_dir,name,size,it))
    #lda_trs.save("{}/lda_trs.model".format(output_dir,name,size,it))

    out_file_asr = codecs.open("{}/asr_wordTopic.txt".format(output_dir), "w", "utf-8")
    out_file_trs = codecs.open("{}/trs_wordTopic.txt".format(output_dir), "w", "utf-8")

    dico = train["vocab"]
    print >>out_file_asr, ",\t".join([dico[x] for x in range(len(train["vocab"]))])
    for line in lda_asr.expElogbeta:
        nline = line / np.sum(line)
        print >>out_file_asr, ",\t".join(str(x) for x in nline)
    out_file_asr.close()

    print >>out_file_trs, ",\t".join([dico[x] for x in range(len(train["vocab"]))])
    for line in lda_trs.expElogbeta:
        nline = line / np.sum(line)
        print >>out_file_trs, ",\t".join(str(x) for x in nline)
    out_file_trs.close()

    K = lda_asr.num_topics
    topicWordProbMat = lda_asr.print_topics(K, 10)
    out_file_asr = codecs.open("{}/asr_best10.txt".format(output_dir), "w", "utf-8")
    for i in topicWordProbMat:
        print >>out_file_asr, i
    out_file_asr.close()

    K = lda_trs.num_topics
    topicWordProbMat = lda_trs.print_topics(K, 10)
    out_file_trs = codecs.open("{}/trs_best10.txt".format(output_dir), "w", "utf-8")
    for i in topicWordProbMat:
        print >>out_file_trs, i
    out_file_trs.close()

if __name__ == "__main__":
    logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING)

    input_shelve = sys.argv[1]
    output_dir = sys.argv[2]
    size = [int(x) for x in sys.argv[3].split("_")]
    workers = int(sys.argv[4])
    name = sys.argv[5]
    it = [int(x) for x in sys.argv[6].split("_")]
    sw_size = [int(x) for x in sys.argv[7].split("_")]
    alpha = ["auto", "symmetric"] + [float(x) for x in sys.argv[8].split("_")]
    eta = ["auto"] + [float(x) for x in sys.argv[9].split("_")]
    passes = [int(x) for x in sys.argv[10].split("_")]
    chunk = [int(x) for x in sys.argv[11].split("_")]

    #train=pickle.load(open("{}/newsgroup_bow_train.pk".format(input_dir)))
    train = shelve.open(input_shelve)
    out_dir = "{}/{}".format(output_dir, name)
    os.mkdir(out_dir)

    for s in size:
        for i in it:
            for sw in sw_size:
                for a in alpha:
                    for e in eta:
                        for p in passes:
                            for c in chunk:
                                train_lda(out_dir, train, name, s, i, sw, a, e, p, c)
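Each run directory name packs the whole hyperparameter combination into a single string, and `calc_perp` parses it back out. A small sketch of that convention, using the example name from the comment in the deleted file above:

name = "s40_it1_sw50_a0.01_e0.1_p6_c1000"  # size 40, 1 iteration, 50 stop words, alpha 0.01, eta 0.1, 6 passes, chunks of 1000
sw_size = int(name.split("_")[2][2:])      # third "_"-field is "sw50"; strip the "sw" prefix -> 50
print sw_size                              # 50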
LDA/02b-lda_order.py
File was deleted. Former contents:

import gensim
import os
import sys
import pickle
from gensim.models.ldamodel import LdaModel
from gensim.models.ldamulticore import LdaMulticore
from collections import Counter
import numpy as np
import codecs
import shelve
import logging
import dill
from tinydb import TinyDB, where, Query
import time
from joblib import Parallel, delayed

def calc_perp(models, train):
    stop_words = models[1]
    name = models[0]

    logging.warning(" go {} ".format(name))
    logging.warning("TRS to be done")
    entry = Query()
    value = db.search(entry.name == name)
    if len(value) > 0:
        logging.warning("{} already done".format(name))
        return

    dev_trs = [[(x, y) for x, y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]]
    lda_trs = models[2]
    perp_trs = lda_trs.log_perplexity(dev_trs)

    logging.warning("ASR to be done")
    dev_asr = [[(x, y) for x, y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]]
    lda_asr = models[5]
    perp_asr = lda_asr.log_perplexity(dev_asr)
    logging.warning("ASR saving")
    res_dict = {"name": name, "asr": perp_asr, "trs": perp_trs}
    return res_dict


def train_lda(out_dir, train, size, it, sw_size, alpha, eta, passes, chunk):
    name = "s{}_it{}_sw{}_a{}_e{}_p{}_c{}".format(size, it, sw_size, alpha, eta, passes, chunk)
    logging.warning(name)
    deep_out_dir = out_dir + "/" + name
    if os.path.isdir(deep_out_dir):
        logging.error(name + " already done")
        return
    logging.warning(name + " to be done")
    asr_count = Counter([x for y in train["ASR_wid"]["TRAIN"] for x in y])
    trs_count = Counter([x for y in train["TRS_wid"]["TRAIN"] for x in y])
    asr_sw = [x[0] for x in asr_count.most_common(sw_size)]
    trs_sw = [x[0] for x in trs_count.most_common(sw_size)]
    stop_words = set(asr_sw) | set(trs_sw)

    logging.warning("TRS to be done")

    lda_trs = LdaModel(corpus=[[(x, y) for x, y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["TRAIN"]], id2word=train["vocab"], num_topics=int(size), chunksize=chunk, iterations=it, alpha=alpha, eta=eta, passes=passes)

    logging.warning("ASR to be done")
    lda_asr = LdaModel(corpus=[[(x, y) for x, y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["TRAIN"]], id2word=train["vocab"], num_topics=int(size), chunksize=chunk, iterations=it, alpha=alpha, eta=eta, passes=passes)

    dico = train["vocab"]
    word_list = [dico[x] for x in range(len(train["vocab"]))]
    asr_probs = []
    for line in lda_asr.expElogbeta:
        nline = line / np.sum(line)
        asr_probs.append([str(x) for x in nline])
    trs_probs = []
    for line in lda_trs.expElogbeta:
        nline = line / np.sum(line)
        trs_probs.append([str(x) for x in nline])

    K = lda_asr.num_topics
    topicWordProbMat_asr = lda_asr.print_topics(K, 10)

    K = lda_trs.num_topics
    topicWordProbMat_trs = lda_trs.print_topics(K, 10)
    os.mkdir(deep_out_dir)
    dill.dump([x for x in stop_words], open(deep_out_dir + "/stopwords.dill", "w"))
    lda_asr.save(deep_out_dir + "/lda_asr.model")
    lda_trs.save(deep_out_dir + "/lda_trs.model")
    dill.dump([x for x in asr_probs], open(deep_out_dir + "/lda_asr_probs.dill", "w"))
    dill.dump([x for x in trs_probs], open(deep_out_dir + "/lda_trs_probs.dill", "w"))

    return [name, stop_words, lda_asr, asr_probs, topicWordProbMat_asr, lda_trs, trs_probs, topicWordProbMat_trs]

def train_one(name, train, s, i, sw, a, e, p, c):
    st = time.time()
    logging.warning(" ; ".join([str(x) for x in [s, i, sw, a, e, p, c]]))
    models = train_lda(name, train, s, i, sw, a, e, p, c)
    if models:
        m = calc_perp(models, train)
        #dill.dump(models,open("{}/{}.dill".format(name,models[0]),"wb"))
    else:
        m = None
    e = time.time()  # NB: reuses the name `e`, shadowing the eta argument
    logging.warning("done in: {}".format(e - st))
    return m


if __name__ == "__main__":
    logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING)

    input_shelve = sys.argv[1]
    db_path = sys.argv[2]
    size = [int(x) for x in sys.argv[3].split("_")]
    workers = int(sys.argv[4])
    name = sys.argv[5]
    it = [int(x) for x in sys.argv[6].split("_")]
    sw_size = [int(x) for x in sys.argv[7].split("_")]
    if sys.argv[8] != "None":
        alpha = ["symmetric", "auto"] + [float(x) for x in sys.argv[8].split("_")]
        eta = ["auto"] + [float(x) for x in sys.argv[9].split("_")]
    else:
        alpha = ["symmetric"]
        eta = ["auto"]
    passes = [int(x) for x in sys.argv[10].split("_")]
    chunk = [int(x) for x in sys.argv[11].split("_")]

    #train=pickle.load(open("{}/newsgroup_bow_train.pk".format(input_dir)))
    train = shelve.open(input_shelve)
    try:
        os.mkdir(name)
    except:
        logging.warning("folder already exists")
    db = TinyDB(db_path)
    nb_model = len(passes) * len(chunk) * len(it) * len(sw_size) * len(alpha) * len(eta) * len(size)
    logging.warning("will train {} models".format(nb_model))

    args_list = []
    for p in passes:
        for c in chunk:
            for i in it:
                for sw in sw_size:
                    for a in alpha:
                        for e in eta:
                            for s in size:
                                args_list.append((name, train, s, i, sw, a, e, p, c))
    res_list = Parallel(n_jobs=15)(delayed(train_one)(*args) for args in args_list)
    for m in res_list:
        db.insert(m)
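The TinyDB guard in `calc_perp` and the final `db.insert` calls imply records of the form {"name", "asr", "trs"}. A minimal sketch of querying the results afterwards (field names taken from `res_dict` above; the database path is illustrative, matching the one used in run.sh below):

from tinydb import TinyDB, Query

db = TinyDB("output_v1/t2db.json")
entry = Query()
# the skip-guard used in calc_perp: has this configuration already been scored?
print db.search(entry.name == "s40_it1_sw50_a0.01_e0.1_p6_c1000")
# pick the configuration with the best dev-set bound on ASR
# (gensim's log_perplexity returns a per-word bound; higher, i.e. closer to 0, is better)
records = [r for r in db.all() if r]
best = max(records, key=lambda r: r["asr"])
print best["name"], best["asr"], best["trs"]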
LDA/04b-mini_ae.py
 
 # coding: utf-8
 
 # In[2]:
 
 # Import
 import gensim
 from scipy import sparse
 import itertools
 from sklearn import preprocessing
 from keras.models import Sequential
 from keras.optimizers import SGD,Adam
 from mlp import *
 import mlp
 import sklearn.metrics
 import shelve
 import pickle
 from utils import *
 import sys
 import os
 import json
 # In[4]:
 
 sparse_model=shelve.open("{}".format(sys.argv[2]))
 in_dir = sys.argv[1]
 infer_model=shelve.open("{}/infer.shelve".format(in_dir))
 #['ASR', 'TRS', 'LABEL']
 # In[6]:
 ASR=sparse_model["ASR_wid"]
 TRS=sparse_model["TRS_wid"]
 LABEL=sparse_model["LABEL"]
 
 
 hidden_size=40
 input_activation="tanh"
 out_activation="tanh"
 loss="mse"
 epochs=500
 batch=1
 patience=60
 do_do=False
 sgd = Adam(lr=0.00001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True)
 try :
     sgd_repr=sgd.get_config()["name"]
 except AttributeError :
     sgd_repr=sgd
 
 params={ "h1" : hidden_size,
          "inside_activation" : input_activation,
          "out_activation" : out_activation,
          "do_dropout": do_do,
          "loss" : loss,
          "epochs" : epochs ,
          "batch_size" : batch,
          "patience" : patience,
          "sgd" : sgd_repr}
 name = "_".join([ str(x) for x in params.values()])
 try:
     os.mkdir("{}/{}".format(in_dir,name))
 except:
     pass
 db = shelve.open("{}/{}/ae_model.shelve".format(in_dir,name),writeback=True)
 db["params"] = params
 db["LABEL"]=LABEL
 #
 json.dump(params,
           open("{}/{}/ae_model.json".format(in_dir,name),"w"),
           indent=4)
 
 keys = ["ASR","TRS"]
 
-mlp_h = [ 40 , 25 , 40]
+mlp_h = [ 512 , 1024 , 2048]
 mlp_loss ="categorical_crossentropy"
 mlp_dropouts = [0,0,0,0]
 mlp_sgd = Adam(0.0001)
 mlp_epochs = 200
 mlp_batch_size = 8
 
 db["AE"] = {}
 for mod in keys :
     res=train_ae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"],infer_model["LDA"][mod]["TEST"],[params["h1"]],patience = params["patience"],sgd=sgd,in_activation="tanh",out_activation="tanh",loss=loss,epochs=epochs,batch_size=batch,verbose=0)
     mlp_res_list=[]
     for layer in res :
         mlp_res_list.append(train_mlp(layer[0],LABEL["TRAIN"],layer[1],LABEL["DEV"],layer[2],LABEL["TEST"],mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs,batch_size=mlp_batch_size,fit_verbose=0))
     db["AE"][mod]=mlp_res_list
 
 mod = "ASR"
 mod2= "TRS"
 mlp_res_list=[]
 
 # NB: y_train below uses mod ("ASR") while y_dev/y_test use mod2 ("TRS"); possibly a bug
 res = train_ae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"],infer_model["LDA"][mod]["TEST"],[params["h1"]],dropouts=[0],patience = params["patience"],sgd=sgd,in_activation="tanh",out_activation="tanh",loss=loss,epochs=epochs,batch_size=batch,y_train=infer_model["LDA"][mod]["TRAIN"],y_dev=infer_model["LDA"][mod2]["DEV"],y_test=infer_model["LDA"][mod2]["TEST"])
 for layer in res :
     mlp_res_list.append(train_mlp(layer[0],LABEL["TRAIN"],layer[1],LABEL["DEV"],layer[2],LABEL["TEST"],mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs,batch_size=mlp_batch_size,fit_verbose=0))
 
 db["AE"]["SPE"] = mlp_res_list
 
 
 db.close()
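To inspect what this script stores, something like the following works (a sketch; the key layout follows the `db[...]` assignments above, the path is a placeholder, and the contents of each entry are whatever `train_mlp` from the local mlp module returns):

import shelve

db = shelve.open("in_dir/run_name/ae_model.shelve")  # placeholder path
print db["params"]  # hyperparameters of this run
for mod in ["ASR", "TRS", "SPE"]:
    # one entry per autoencoder layer
    for i, scores in enumerate(db["AE"][mod]):
        print mod, "layer", i, scores
db.close()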
LDA/04e-mm_vae.py
 
 # coding: utf-8
 import gensim
 from scipy import sparse
 import itertools
 from sklearn import preprocessing
 from keras.models import Sequential
 from keras.optimizers import SGD,Adam
 from mlp import *
 from vae import *
 import sklearn.metrics
 import shelve
 import pickle
 from utils import *
 import sys
 import os
 import json
 # In[4]:
 
 infer_model=shelve.open("{}".format(sys.argv[2]))
 in_dir = sys.argv[1]
 #['ASR', 'TRS', 'LABEL']
 # In[6]:
 if len(sys.argv) > 4 :
     features_key = sys.argv[4]
 else :
     features_key = "LDA"
 
 save_projection = True
 json_conf =json.load(open(sys.argv[3]))
 vae_conf = json_conf["vae"]
 
 hidden_size= vae_conf["hidden_size"]
 input_activation=vae_conf["input_activation"]
 output_activation=vae_conf["output_activation"]
 epochs=vae_conf["epochs"]
 batch=vae_conf["batch"]
 patience=vae_conf["patience"]
 latent_dim = vae_conf["latent"]
 try:
     k = vae_conf["sgd"]
     if vae_conf["sgd"]["name"] == "adam":
         sgd = Adam(lr=vae_conf["sgd"]["lr"])#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True)
     elif vae_conf["sgd"]["name"] == "sgd":
         sgd = SGD(lr=vae_conf["sgd"]["lr"])
 except:
     sgd = vae_conf["sgd"]
 
 mlp_conf = json_conf["mlp"]
 mlp_h = mlp_conf["hidden_size"]
 mlp_loss = mlp_conf["loss"]
 mlp_dropouts = mlp_conf["do"]
 mlp_epochs = mlp_conf["epochs"]
 mlp_batch_size = mlp_conf["batch"]
 mlp_input_activation=mlp_conf["input_activation"]
 mlp_output_activation=mlp_conf["output_activation"]
 
 
 try:
     k = mlp_conf["sgd"]
     if mlp_conf["sgd"]["name"] == "adam":
         mlp_sgd = Adam(lr=mlp_conf["sgd"]["lr"])#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True)
     elif mlp_conf["sgd"]["name"] == "sgd":
         mlp_sgd = SGD(lr=mlp_conf["sgd"]["lr"])
 except:
     mlp_sgd = mlp_conf["sgd"]
 
 
 name = json_conf["name"]
 
 try :
     print "make folder "
     os.mkdir("{}/{}".format(in_dir,name))
 except:
     print "folder not created"
     pass
 
 
 db = shelve.open("{}/{}/ae_model.shelve".format(in_dir,name),writeback=True)
 db["LABEL"]=infer_model["LABEL"]
 #
 
 
 keys = infer_model[features_key].keys()
 
 db["VAE"] = {}
 db[features_key] = {}
 for mod in keys :
     #print mod
     db[features_key][mod] = train_mlp(infer_model[features_key][mod]["TRAIN"],infer_model["LABEL"][mod]["TRAIN"],
                                       infer_model[features_key][mod]["DEV"],infer_model["LABEL"][mod]["DEV"],
                                       infer_model[features_key][mod]["TEST"],infer_model["LABEL"][mod]["TEST"],
                                       mlp_h ,sgd=mlp_sgd,
                                       epochs=mlp_epochs,
                                       batch_size=mlp_batch_size,
                                       input_activation=input_activation,
                                       output_activation=mlp_output_activation,
                                       dropouts=mlp_dropouts,
                                       fit_verbose=0)
 
     res=train_vae(infer_model[features_key][mod]["TRAIN"],infer_model[features_key][mod]["DEV"],infer_model[features_key][mod]["TEST"],
                   hidden_size=hidden_size[0],
                   latent_dim=latent_dim,sgd=sgd,
                   input_activation=input_activation,output_activation=output_activation,
                   nb_epochs=epochs,batch_size=batch)
     mlp_res_list=[]
     for nb,layer in enumerate(res) :
         if save_projection:
             pd = pandas.DataFrame(layer[0])  # NB: `pandas` arrives via `from vae import *`
             col_count = (pd.sum(axis=0) != 0)
-            pd = pd.loc[:,cyyol_count]
+            pd = pd.loc[:,col_count]
             pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,mod),"TRAIN")
             pd = pandas.DataFrame(layer[1])
             pd = pd.loc[:,col_count]
             pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,mod),"DEV")
             pd = pandas.DataFrame(layer[2])
             pd = pd.loc[:,col_count]
             pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,mod),"TEST")
             del pd
 
         mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"],
                                       layer[1],infer_model["LABEL"][mod]["DEV"],
                                       layer[2],infer_model["LABEL"][mod]["TEST"],
                                       mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs,
                                       output_activation=mlp_output_activation,
                                       input_activation=input_activation,
                                       batch_size=mlp_batch_size,fit_verbose=0))
     db["VAE"][mod]=mlp_res_list
 
 if "ASR" in keys and "TRS" in keys :
     mod = "ASR"
     mod2= "TRS"
     mlp_res_list=[]
 
     res = train_vae(infer_model[features_key][mod]["TRAIN"],
                     infer_model[features_key][mod]["DEV"],
                     infer_model[features_key][mod]["TEST"],
                     hidden_size=hidden_size[0],
                     sgd=sgd,input_activation=input_activation,output_activation=output_activation,
                     latent_dim=latent_dim,
                     nb_epochs=epochs,
                     batch_size=batch,
                     y_train=infer_model[features_key][mod2]["TRAIN"],
                     y_dev=infer_model[features_key][mod2]["DEV"],
                     y_test=infer_model[features_key][mod2]["TEST"])
 
     for nb,layer in enumerate(res) :
         if save_projection:
             pd = pandas.DataFrame(layer[0])
             col_count = (pd.sum(axis=0) != 0)
             pd = pd.loc[:,col_count]
             pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,"SPE"),"TRAIN")
             pd = pandas.DataFrame(layer[1])
             pd = pd.loc[:,col_count]
             pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,"SPE"),"DEV")
             pd = pandas.DataFrame(layer[2])
             pd = pd.loc[:,col_count]
             pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,"SPE"),"TEST")
             del pd
 
         mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"],
                                       layer[1],infer_model["LABEL"][mod]["DEV"],
                                       layer[2],infer_model["LABEL"][mod]["TEST"],
                                       mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs,
                                       output_activation=mlp_output_activation,
                                       input_activation=input_activation,
                                       batch_size=mlp_batch_size,fit_verbose=0))
 
     db["VAE"]["SPE"] = mlp_res_list
 
 db.sync()
 db.close()
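The script's third argument must point at a JSON file with `name`, `vae`, and `mlp` sections. A plausible minimal config, inferred from the keys the script reads above (all concrete values are illustrative, not from the repo):

{
    "name": "vae_run1",
    "vae": {
        "hidden_size": [80],
        "input_activation": "relu",
        "output_activation": "sigmoid",
        "epochs": 100,
        "batch": 8,
        "patience": 60,
        "latent": 12,
        "sgd": {"name": "adam", "lr": 0.0001}
    },
    "mlp": {
        "hidden_size": [512],
        "loss": "categorical_crossentropy",
        "do": [0, 0],
        "epochs": 200,
        "batch": 8,
        "input_activation": "tanh",
        "output_activation": "softmax",
        "sgd": {"name": "adam", "lr": 0.0001}
    }
}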
LDA/run.sh
File was deleted. Former contents:

python 00-prepross.py
python 02-lda_split.py DECODA_list_wid.shelve output_v1/ 100 12 test2 1 400
python 03-mono_perplex.py DECODA_list_wid.shelve output_v1/test2 output_v1/t2db.json
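Mapped against the argv parsing in the (also deleted) 02-lda_split.py, the second call reads as follows; note it supplies only 7 of the 11 positional arguments the script expects:

python 02-lda_split.py DECODA_list_wid.shelve output_v1/ 100 12 test2 1 400
#                      sys.argv[1]=input_shelve, [2]=output_dir, [3]=size,
#                      [4]=workers, [5]=name, [6]=it, [7]=sw_size
# missing: alpha, eta, passes, chunk (sys.argv[8]..sys.argv[11]),
# so as written this call would raise an IndexError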
LDA/vae.py
 '''This script demonstrates how to build a variational autoencoder with Keras.
 Reference: "Auto-Encoding Variational Bayes" https://arxiv.org/abs/1312.6114
 '''
 
 import itertools
 import sys
 import json
 
 import numpy as np
 import matplotlib.pyplot as plt
 from scipy import sparse
 import scipy.io
 
 from keras.layers import Input, Dense, Lambda
 from keras.models import Model
 from keras import backend as K
 from keras import objectives
 from keras.datasets import mnist
 from keras.callbacks import EarlyStopping,Callback
 
 import pandas
 import shelve
 import pickle
 
 
 class ZeroStopping(Callback):
     '''Stop training when a monitored quantity has stopped improving.
     # Arguments
         monitor: quantity to be monitored.
         patience: number of epochs with no improvement
             after which training will be stopped.
         verbose: verbosity mode.
         mode: one of {auto, min, max}. In 'min' mode,
             training will stop when the quantity
             monitored has stopped decreasing; in 'max'
             mode it will stop when the quantity
             monitored has stopped increasing.
     '''
     def __init__(self, monitor='val_loss', verbose=0, mode='auto', thresh = 0):
         super(ZeroStopping, self).__init__()
 
         self.monitor = monitor
         self.verbose = verbose
         self.thresh = thresh  # threshold
 
         if mode not in ['auto', 'min', 'max']:
             # NB: `warnings` is never imported in this file, and `self.mode` is read
             # before ever being assigned, so this branch would itself raise
             warnings.warn('EarlyStopping mode %s is unknown, '
                           'fallback to auto mode.' % (self.mode),
                           RuntimeWarning)
             mode = 'auto'
 
         if mode == 'min':
             self.monitor_op = np.less
         elif mode == 'max':
             self.monitor_op = np.greater
         else:
             if 'acc' in self.monitor:
                 self.monitor_op = np.greater
             else:
                 self.monitor_op = np.less
 
     def on_epoch_end(self, epoch, logs={}):
         current = logs.get(self.monitor)
         if current is None:
             warnings.warn('Zero stopping requires %s available!' %
                           (self.monitor), RuntimeWarning)
 
         if self.monitor_op(current, self.thresh):
             self.best = current
             self.model.stop_training = True
 
 #batch_size = 16
 #original_dim = 784
 #latent_dim = 2
 #intermediate_dim = 128
 #epsilon_std = 0.01
 #nb_epoch = 40
 
 
 def train_vae(x_train,x_dev,x_test,y_train=None,y_dev=None,y_test=None,hidden_size=80,latent_dim=12,batch_size=8,nb_epochs=10,sgd="rmsprop",input_activation = "relu",output_activation = "sigmoid",epsilon_std=0.01):
 
     def sampling(args):
         z_mean, z_log_std = args
         epsilon = K.random_normal(shape=(batch_size, latent_dim),
                                   mean=0., std=epsilon_std)
         return z_mean + K.exp(z_log_std) * epsilon
 
     def vae_loss(x, x_decoded_mean):
         xent_loss = objectives.binary_crossentropy(x, x_decoded_mean)
         kl_loss = - 0.5 * K.mean(1 + z_log_std - K.square(z_mean) - K.exp(z_log_std), axis=-1)
         return xent_loss + kl_loss
 
     original_dim = x_train.shape[1]
 
     x = Input(batch_shape=(batch_size, original_dim))
     h = Dense(hidden_size, activation=input_activation)(x)
     z_mean = Dense(latent_dim)(h)
     z_log_std = Dense(latent_dim)(h)
 
     # note that "output_shape" isn't necessary with the TensorFlow backend
     # so you could write `Lambda(sampling)([z_mean, z_log_std])`
     z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_std])
 
     # we instantiate these layers separately so as to reuse them later
     decoder_h = Dense(hidden_size, activation=input_activation)
     decoder_mean = Dense(original_dim, activation=output_activation)
     h_decoded = decoder_h(z)
     x_decoded_mean = decoder_mean(h_decoded)
 
     vae = Model(x, x_decoded_mean)
     vae.compile(optimizer=sgd, loss=vae_loss)
 
     # if no separate targets are given, train as a plain (self-reconstructing) VAE
     if y_train is None or y_dev is None or y_test is None :
         y_train = x_train
         y_dev = x_dev
         y_test = x_test
 
     vae.fit(x_train, y_train,
             shuffle=True,
             nb_epoch=nb_epochs,
             verbose = 1,
             batch_size=batch_size,
-            validation_data=(x_dev, y_dev),
-            callbacks = [ZeroStopping(monitor='val_loss', thresh=0, verbose=0, mode='min')]
+            validation_data=(x_dev, y_dev)
+            #callbacks = [ZeroStopping(monitor='val_loss', thresh=0, verbose=0, mode='min')]
             )
 
     # build a model to project inputs on the latent space
     encoder = Model(x, z_mean)
     pred_train = encoder.predict(x_train, batch_size=batch_size)
     pred_dev = encoder.predict(x_dev, batch_size=batch_size)
     pred_test = encoder.predict(x_test,batch_size=batch_size)
     return [ [ pred_train, pred_dev, pred_test ] ]
     # display a 2D plot of the digit classes in the latent space
     #x_test_encoded = encoder.predict(x_test, batch_size=batch_size)
     # build a digit generator that can sample from the learned distribution
     #decoder_input = Input(shape=(latent_dim,))
     #_h_decoded = decoder_h(decoder_input)
     #_x_decoded_mean = decoder_mean(_h_decoded)
     #generator = Model(decoder_input, _x_decoded_mean)
     #x_decoded = generator.predict(z_sample)
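A minimal usage sketch of `train_vae` on random data (shapes and hyperparameters are illustrative; note the Keras-1 style `nb_epoch`/`std` arguments used throughout this file, and that the model is built with a fixed `batch_shape`, so `batch_size` should divide every split's sample count):

import numpy as np
from vae import train_vae

# toy matrices: 80 train / 16 dev / 16 test documents, 120 features each
x_train = np.random.rand(80, 120).astype("float32")
x_dev   = np.random.rand(16, 120).astype("float32")
x_test  = np.random.rand(16, 120).astype("float32")

# returns [[train_proj, dev_proj, test_proj]] in the latent space (see encoder.predict above)
[[z_train, z_dev, z_test]] = train_vae(x_train, x_dev, x_test,
                                       hidden_size=64, latent_dim=12,
                                       batch_size=8, nb_epochs=5)
print z_train.shape  # (80, 12): documents projected onto the latent space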