Commit d1012a7a1689588ac0d1e4a716497562663c14c2
1 parent: ee9023b1c9
Exists in: master
update LDA/.py
Showing 7 changed files with 8 additions and 289 deletions
LDA/00-mmf_make_features.py
| 1 | import sys | 1 | import sys |
| 2 | import os | 2 | import os |
| 3 | 3 | ||
| 4 | import pandas | 4 | import pandas |
| 5 | import numpy | 5 | import numpy |
| 6 | import shelve | 6 | import shelve |
| 7 | 7 | ||
| 8 | from sklearn.preprocessing import LabelBinarizer | 8 | from sklearn.preprocessing import LabelBinarizer |
| 9 | 9 | ||
| 10 | from utils import select_mmf as select | 10 | from utils import select_mmf as select |
| 11 | 11 | ||
| 12 | input_dir = sys.argv[1] # Top-level directory containing ASR and TRS | 12 | input_dir = sys.argv[1] # Top-level directory containing ASR and TRS |
| 13 | level = sys.argv[2] # desired LDA size ( -5) | 13 | level = sys.argv[2] # desired LDA size ( -5) |
| 14 | output_dir = sys.argv[3] | 14 | output_dir = sys.argv[3] |
| 15 | 15 | ||
| 16 | lb=LabelBinarizer() | 16 | lb=LabelBinarizer() |
| 17 | #y_train=lb.fit_transform([utils.select(ligneid) for ligneid in origin_corps["LABEL"]["TRAIN"]]) | 17 | #y_train=lb.fit_transform([utils.select(ligneid) for ligneid in origin_corps["LABEL"]["TRAIN"]]) |
| 18 | 18 | ||
| 19 | 19 | ||
| 20 | data = shelve.open("{}/mmf_{}.shelve".format(output_dir,level),writeback=True) | 20 | data = shelve.open("{}/mmf_{}.shelve".format(output_dir,level),writeback=True) |
| 21 | data["LABEL"]= {} | 21 | data["LABEL"]= {} |
| 22 | data["LDA"] = {"ASR":{},"TRS":{}} | 22 | data["LDA"] = {"ASR":{},"TRS":{}} |
| 23 | for mod in ["ASR", "TRS" ]: | 23 | for mod in ["ASR", "TRS" ]: |
| 24 | train = pandas.read_table("{}/{}/train_{}.ssv".format(input_dir, mod, level), sep=" ", header=None ) | 24 | train = pandas.read_table("{}/{}/train_{}.tab".format(input_dir, mod, level), sep=" ", header=None ) |
| 25 | dev = pandas.read_table("{}/{}/dev_{}.ssv".format(input_dir, mod, level), sep=" ", header=None ) | 25 | dev = pandas.read_table("{}/{}/dev_{}.tab".format(input_dir, mod, level), sep=" ", header=None ) |
| 26 | test = pandas.read_table("{}/{}/test_{}.ssv".format(input_dir, mod, level), sep=" ", header=None ) | 26 | test = pandas.read_table("{}/{}/test_{}.tab".format(input_dir, mod, level), sep=" ", header=None ) |
| 27 | 27 | ||
| 28 | y_train = train.iloc[:,0].apply(select) | 28 | y_train = train.iloc[:,0].apply(select) |
| 29 | y_dev = dev.iloc[:,0].apply(select) | 29 | y_dev = dev.iloc[:,0].apply(select) |
| 30 | y_test = test.iloc[:,0].apply(select) | 30 | y_test = test.iloc[:,0].apply(select) |
| 31 | lb.fit(y_train) | 31 | lb.fit(y_train) |
| 32 | data["LABEL"][mod]={"TRAIN":lb.transform(y_train),"DEV":lb.transform(y_dev), "TEST": lb.transform(y_test)} | 32 | data["LABEL"][mod]={"TRAIN":lb.transform(y_train),"DEV":lb.transform(y_dev), "TEST": lb.transform(y_test)} |
| 33 | 33 | ||
| 34 | # data["LDA"][mod]={'ASR':[]} | 34 | # data["LDA"][mod]={'ASR':[]} |
| 35 | print data["LDA"][mod] | ||
| 36 | print train.values | 35 | print train.values |
| 37 | data["LDA"][mod]["TRAIN"]=train.iloc[:,1:-1].values | 36 | data["LDA"][mod]["TRAIN"]=train.iloc[:,1:-1].values |
| 38 | data["LDA"][mod]["DEV"]=dev.iloc[:,1:-1].values | 37 | data["LDA"][mod]["DEV"]=dev.iloc[:,1:-1].values |
| 39 | data["LDA"][mod]["TEST"]=test.iloc[:,1:-1].values | 38 | data["LDA"][mod]["TEST"]=test.iloc[:,1:-1].values |
| 40 | 39 | ||
| 40 | print data["LDA"][mod]["TRAIN"].shape | ||
| 41 | data.sync() | 41 | data.sync() |
| 42 | data.close() | 42 | data.close() |
LDA/02-lda_split.py
| 1 | import gensim | File was deleted | |
| 2 | import os | ||
| 3 | import sys | ||
| 4 | import pickle | ||
| 5 | from gensim.models.ldamodel import LdaModel | ||
| 6 | from gensim.models.ldamulticore import LdaMulticore | ||
| 7 | from collections import Counter | ||
| 8 | import numpy as np | ||
| 9 | import codecs | ||
| 10 | import shelve | ||
| 11 | import logging | ||
| 12 | |||
| 13 | def calc_perp(in_dir,train): | ||
| 14 | name = in_dir.split("/")[-1] | ||
| 15 | # s40_it1_sw50_a0.01_e0.1_p6_c1000 | ||
| 16 | sw_size = int(name.split("_")[2][2:]) | ||
| 17 | |||
| 18 | logging.warning(" go {} ".format(name)) | ||
| 19 | |||
| 20 | |||
| 21 | logging.warning("Redo Vocab and stop") | ||
| 22 | asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y]) | ||
| 23 | trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y]) | ||
| 24 | asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ] | ||
| 25 | trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ] | ||
| 26 | stop_words=set(asr_sw) | set(trs_sw) | ||
| 27 | |||
| 28 | logging.warning("TRS to be done") | ||
| 29 | entry = Query() | ||
| 30 | value=db.search(entry.name == name) | ||
| 31 | if len(value) > 0 : | ||
| 32 | logging.warning("{} already done".format(name)) | ||
| 33 | return | ||
| 34 | |||
| 35 | dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]] | ||
| 36 | lda_trs = LdaModel.load("{}/lda_trs.model".format(in_dir)) | ||
| 37 | perp_trs = lda_trs.log_perplexity(dev_trs) | ||
| 38 | logging.warning("ASR to be done") | ||
| 39 | dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]] | ||
| 40 | lda_asr = LdaModel.load("{}/lda_asr.model".format(in_dir)) | ||
| 41 | perp_asr = lda_asr.log_perplexity(dev_asr) | ||
| 42 | logging.warning("ASR saving") | ||
| 43 | res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs} | ||
| 44 | return res_dict | ||
| 45 | |||
| 46 | |||
| 47 | |||
| 48 | |||
| 49 | def train_lda(out_dir,train,name,size,it,sw_size,alpha,eta,passes,chunk): | ||
| 50 | output_dir = "{}/s{}_it{}_sw{}_a{}_e{}_p{}_c{}".format(out_dir,size,it,sw_size,alpha,eta,passes,chunk) | ||
| 51 | os.mkdir(output_dir) | ||
| 52 | logging.info(output_dir+" to be done") | ||
| 53 | asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y]) | ||
| 54 | trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y]) | ||
| 55 | asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ] | ||
| 56 | trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ] | ||
| 57 | stop_words=set(asr_sw) | set(trs_sw) | ||
| 58 | |||
| 59 | logging.info("TRS to be done") | ||
| 60 | |||
| 61 | lda_trs = LdaModel(corpus=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["TRAIN"]], id2word=train["vocab"], num_topics=int(size), chunksize=1000,iterations=it) | ||
| 62 | |||
| 63 | logging.info("ASR to be done") | ||
| 64 | lda_asr = LdaModel(corpus=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["TRAIN"]], id2word=train["vocab"], num_topics=int(size), chunksize=1000,iterations=it) | ||
| 65 | |||
| 66 | #logger.info("ASR saving") | ||
| 67 | #lda_asr.save("{}/lda_asr.model".format(output_dir,name,size,it)) | ||
| 68 | #lda_trs.save("{}/lda_trs.model".format(output_dir,name,size,it)) | ||
| 69 | |||
| 70 | |||
| 71 | out_file_asr=codecs.open("{}/asr_wordTopic.txt".format(output_dir),"w","utf-8") | ||
| 72 | out_file_trs=codecs.open("{}/trs_wordTopic.txt".format(output_dir),"w","utf-8") | ||
| 73 | |||
| 74 | dico = train["vocab"] | ||
| 75 | print >>out_file_asr, ",\t".join( [ dico[x] for x in range(len(train["vocab"]))]) | ||
| 76 | for line in lda_asr.expElogbeta: | ||
| 77 | nline = line / np.sum(line) | ||
| 78 | print >>out_file_asr, ",\t".join( str(x) for x in nline) | ||
| 79 | out_file_asr.close() | ||
| 80 | |||
| 81 | print >>out_file_trs, ",\t".join( [ dico[x] for x in range(len(train["vocab"]))]) | ||
| 82 | for line in lda_trs.expElogbeta: | ||
| 83 | nline = line / np.sum(line) | ||
| 84 | print >>out_file_trs, ",\t".join( str(x) for x in nline) | ||
| 85 | out_file_trs.close() | ||
| 86 | |||
| 87 | K = lda_asr.num_topics | ||
| 88 | topicWordProbMat = lda_asr.print_topics(K,10) | ||
| 89 | out_file_asr=codecs.open("{}/asr_best10.txt".format(output_dir),"w","utf-8") | ||
| 90 | for i in topicWordProbMat: | ||
| 91 | print >>out_file_asr,i | ||
| 92 | out_file_asr.close() | ||
| 93 | |||
| 94 | K = lda_trs.num_topics | ||
| 95 | topicWordProbMat = lda_trs.print_topics(K,10) | ||
| 96 | out_file_trs=codecs.open("{}/trs_best10.txt".format(output_dir),"w","utf-8") | ||
| 97 | for i in topicWordProbMat: | ||
| 98 | print >>out_file_trs,i | ||
| 99 | out_file_trs.close() | ||
| 100 | |||
| 101 | if __name__ == "__main__": | ||
| 102 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING) | ||
| 103 | |||
| 104 | input_shelve = sys.argv[1] | ||
| 105 | output_dir = sys.argv[2] | ||
| 106 | size = [ int(x) for x in sys.argv[3].split("_")] | ||
| 107 | workers = int(sys.argv[4]) | ||
| 108 | name = sys.argv[5] | ||
| 109 | it = [ int(x) for x in sys.argv[6].split("_")] | ||
| 110 | sw_size = [ int(x) for x in sys.argv[7].split("_")] | ||
| 111 | alpha = ["auto" , "symmetric"] + [ float(x) for x in sys.argv[8].split("_")] | ||
| 112 | eta = ["auto"] + [ float(x) for x in sys.argv[9].split("_")] | ||
| 113 | passes = [ int(x) for x in sys.argv[10].split("_")] | ||
| 114 | chunk = [ int(x) for x in sys.argv[11].split("_")] | ||
| 115 | |||
| 116 | #train=pickle.load(open("{}/newsgroup_bow_train.pk".format(input_dir))) | ||
| 117 | train = shelve.open(input_shelve) | ||
| 118 | out_dir = "{}/{}".format(output_dir,name) | ||
| 119 | os.mkdir(out_dir) | ||
| 120 | |||
| 121 | for s in size: | ||
| 122 | for i in it : | ||
| 123 | for sw in sw_size: | ||
| 124 | for a in alpha: | ||
| 125 | for e in eta: | ||
| 126 | for p in passes: | ||
| 127 | for c in chunk: | ||
| 128 | train_lda(out_dir,train,name,s,i,sw,a,e,p,c) | ||
| 129 | 1 | import gensim | |
| 130 | 2 | import os |
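Each run of train_lda writes into a directory named s{size}_it{it}_sw{sw_size}_a{alpha}_e{eta}_p{passes}_c{chunk}, and calc_perp recovers the stop-word count by parsing that name. A small sketch of the round trip, using the example name from the comment in calc_perp:

# Recovering hyperparameters from a run-directory name; the format is the one
# built in train_lda, the example value comes from the comment in calc_perp.
name = "s40_it1_sw50_a0.01_e0.1_p6_c1000"
fields = name.split("_")
size = int(fields[0][1:])        # "s40"  -> 40 topics
iterations = int(fields[1][2:])  # "it1"  -> 1 LDA iteration
sw_size = int(fields[2][2:])     # "sw50" -> 50 stop words, as parsed in calc_perp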
LDA/02b-lda_order.py
| 1 | import gensim | File was deleted | |
| 2 | import os | ||
| 3 | import sys | ||
| 4 | import pickle | ||
| 5 | from gensim.models.ldamodel import LdaModel | ||
| 6 | from gensim.models.ldamulticore import LdaMulticore | ||
| 7 | from collections import Counter | ||
| 8 | import numpy as np | ||
| 9 | import codecs | ||
| 10 | import shelve | ||
| 11 | import logging | ||
| 12 | import dill | ||
| 13 | from tinydb import TinyDB, where, Query | ||
| 14 | import time | ||
| 15 | from joblib import Parallel, delayed | ||
| 16 | |||
| 17 | def calc_perp(models,train): | ||
| 18 | |||
| 19 | |||
| 20 | stop_words=models[1] | ||
| 21 | name = models[0] | ||
| 22 | |||
| 23 | logging.warning(" go {} ".format(name)) | ||
| 24 | logging.warning("TRS to be done") | ||
| 25 | entry = Query() | ||
| 26 | value=db.search(entry.name == name) | ||
| 27 | if len(value) > 0 : | ||
| 28 | logging.warning("{} already done".format(name)) | ||
| 29 | return | ||
| 30 | |||
| 31 | dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]] | ||
| 32 | lda_trs = models[2] | ||
| 33 | perp_trs = lda_trs.log_perplexity(dev_trs) | ||
| 34 | |||
| 35 | logging.warning("ASR to be done") | ||
| 36 | dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]] | ||
| 37 | lda_asr = models[5] | ||
| 38 | perp_asr = lda_asr.log_perplexity(dev_asr) | ||
| 39 | logging.warning("ASR saving") | ||
| 40 | res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs } | ||
| 41 | return res_dict | ||
| 42 | |||
| 43 | |||
| 44 | |||
| 45 | |||
| 46 | def train_lda(out_dir,train,size,it,sw_size,alpha,eta,passes,chunk): | ||
| 47 | name = "s{}_it{}_sw{}_a{}_e{}_p{}_c{}".format(size,it,sw_size,alpha,eta,passes,chunk) | ||
| 48 | logging.warning(name) | ||
| 49 | deep_out_dir = out_dir+"/"+name | ||
| 50 | if os.path.isdir(deep_out_dir): | ||
| 51 | logging.error(name+" already done") | ||
| 52 | return | ||
| 53 | logging.warning(name+" to be done") | ||
| 54 | asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y]) | ||
| 55 | trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y]) | ||
| 56 | asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ] | ||
| 57 | trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ] | ||
| 58 | stop_words=set(asr_sw) | set(trs_sw) | ||
| 59 | |||
| 60 | logging.warning("TRS to be done") | ||
| 61 | |||
| 62 | lda_trs = LdaModel(corpus=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["TRAIN"]], id2word=train["vocab"], num_topics=int(size), chunksize=chunk,iterations=it,alpha=alpha,eta=eta,passes=passes) | ||
| 63 | |||
| 64 | logging.warning("ASR to be done") | ||
| 65 | lda_asr = LdaModel(corpus=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["TRAIN"]], id2word=train["vocab"], num_topics=int(size), chunksize=chunk,iterations=it,alpha=alpha,eta=eta,passes=passes) | ||
| 66 | |||
| 67 | dico = train["vocab"] | ||
| 68 | word_list = [ dico[x] for x in range(len(train["vocab"]))] | ||
| 69 | asr_probs = [] | ||
| 70 | for line in lda_asr.expElogbeta: | ||
| 71 | nline = line / np.sum(line) | ||
| 72 | asr_probs.append([ str(x) for x in nline]) | ||
| 73 | trs_probs = [] | ||
| 74 | for line in lda_trs.expElogbeta: | ||
| 75 | nline = line / np.sum(line) | ||
| 76 | trs_probs.append([str(x) for x in nline]) | ||
| 77 | |||
| 78 | K = lda_asr.num_topics | ||
| 79 | topicWordProbMat_asr = lda_asr.print_topics(K,10) | ||
| 80 | |||
| 81 | K = lda_trs.num_topics | ||
| 82 | topicWordProbMat_trs = lda_trs.print_topics(K,10) | ||
| 83 | os.mkdir(deep_out_dir) | ||
| 84 | dill.dump([x for x in stop_words],open(deep_out_dir+"/stopwords.dill","w")) | ||
| 85 | lda_asr.save(deep_out_dir+"/lda_asr.model") | ||
| 86 | lda_trs.save(deep_out_dir+"/lda_trs.model") | ||
| 87 | dill.dump([x for x in asr_probs],open(deep_out_dir+"/lda_asr_probs.dill","w")) | ||
| 88 | dill.dump([x for x in trs_probs],open(deep_out_dir+"/lda_trs_probs.dill","w")) | ||
| 89 | |||
| 90 | return [name, stop_words, lda_asr , asr_probs , topicWordProbMat_asr, lda_trs, trs_probs, topicWordProbMat_trs] | ||
| 91 | |||
| 92 | def train_one(name,train,s,i,sw,a,e,p,c): | ||
| 93 | st=time.time() | ||
| 94 | logging.warning(" ; ".join([str(x) for x in [s,i,sw,a,e,p,c]])) | ||
| 95 | models = train_lda(name,train,s,i,sw,a,e,p,c) | ||
| 96 | if models: | ||
| 97 | m = calc_perp(models,train) | ||
| 98 | #dill.dump(models,open("{}/{}.dill".format(name,models[0]),"wb")) | ||
| 99 | else : | ||
| 100 | m = None | ||
| 101 | e = time.time() | ||
| 102 | logging.warning("fin en : {}".format(e-st)) | ||
| 103 | return m | ||
| 104 | |||
| 105 | |||
| 106 | |||
| 107 | |||
| 108 | if __name__ == "__main__": | ||
| 109 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING) | ||
| 110 | |||
| 111 | input_shelve = sys.argv[1] | ||
| 112 | db_path = sys.argv[2] | ||
| 113 | size = [ int(x) for x in sys.argv[3].split("_")] | ||
| 114 | workers = int(sys.argv[4]) | ||
| 115 | name = sys.argv[5] | ||
| 116 | it = [ int(x) for x in sys.argv[6].split("_")] | ||
| 117 | sw_size = [ int(x) for x in sys.argv[7].split("_")] | ||
| 118 | if sys.argv[8] != "None" : | ||
| 119 | alpha = [ "symmetric", "auto" ] + [ float(x) for x in sys.argv[8].split("_")] | ||
| 120 | eta = ["auto"] + [ float(x) for x in sys.argv[9].split("_")] | ||
| 121 | else : | ||
| 122 | alpha = ["symmetric"] | ||
| 123 | eta = ["auto"] | ||
| 124 | passes = [ int(x) for x in sys.argv[10].split("_")] | ||
| 125 | chunk = [ int(x) for x in sys.argv[11].split("_")] | ||
| 126 | |||
| 127 | #train=pickle.load(open("{}/newsgroup_bow_train.pk".format(input_dir))) | ||
| 128 | train = shelve.open(input_shelve) | ||
| 129 | try : | ||
| 130 | os.mkdir(name) | ||
| 131 | except : | ||
| 132 | logging.warning(" folder already existe " ) | ||
| 133 | db = TinyDB(db_path) | ||
| 134 | nb_model = len(passes) * len(chunk) * len(it) * len(sw_size) * len(alpha) * len(eta) * len(size) | ||
| 135 | logging.warning(" hey will train {} models ".format(nb_model)) | ||
| 136 | |||
| 137 | args_list=[] | ||
| 138 | for p in passes: | ||
| 139 | for c in chunk: | ||
| 140 | for i in it : | ||
| 141 | for sw in sw_size: | ||
| 142 | for a in alpha: | ||
| 143 | for e in eta: | ||
| 144 | for s in size: | ||
| 145 | args_list.append((name,train,s,i,sw,a,e,p,c)) | ||
| 146 | res_list= Parallel(n_jobs=15)(delayed(train_one)(*args) for args in args_list) | ||
| 147 | for m in res_list : | ||
| 148 | if m is not None : db.insert(m) # train_one returns None for runs already recorded | ||
| 149 | |||
| 150 | 1 | import gensim |
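calc_perp skips any model whose name already appears in the TinyDB results database, and __main__ inserts one record per trained model. A minimal sketch of that lookup/insert pattern; the database path and the perplexity values are illustrative only:

from tinydb import TinyDB, Query

db = TinyDB("results.json")  # hypothetical path; the script takes it as sys.argv[2]
entry = Query()
name = "s40_it1_sw50_a0.01_e0.1_p6_c1000"
if not db.search(entry.name == name):
    db.insert({"name": name, "asr": -7.2, "trs": -6.9})  # dummy perplexity values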
LDA/04b-mini_ae.py
| 1 | 1 | ||
| 2 | # coding: utf-8 | 2 | # coding: utf-8 |
| 3 | 3 | ||
| 4 | # In[2]: | 4 | # In[2]: |
| 5 | 5 | ||
| 6 | # Import | 6 | # Import |
| 7 | import gensim | 7 | import gensim |
| 8 | from scipy import sparse | 8 | from scipy import sparse |
| 9 | import itertools | 9 | import itertools |
| 10 | from sklearn import preprocessing | 10 | from sklearn import preprocessing |
| 11 | from keras.models import Sequential | 11 | from keras.models import Sequential |
| 12 | from keras.optimizers import SGD,Adam | 12 | from keras.optimizers import SGD,Adam |
| 13 | from mlp import * | 13 | from mlp import * |
| 14 | import mlp | 14 | import mlp |
| 15 | import sklearn.metrics | 15 | import sklearn.metrics |
| 16 | import shelve | 16 | import shelve |
| 17 | import pickle | 17 | import pickle |
| 18 | from utils import * | 18 | from utils import * |
| 19 | import sys | 19 | import sys |
| 20 | import os | 20 | import os |
| 21 | import json | 21 | import json |
| 22 | # In[4]: | 22 | # In[4]: |
| 23 | 23 | ||
| 24 | sparse_model=shelve.open("{}".format(sys.argv[2])) | 24 | sparse_model=shelve.open("{}".format(sys.argv[2])) |
| 25 | in_dir = sys.argv[1] | 25 | in_dir = sys.argv[1] |
| 26 | infer_model=shelve.open("{}/infer.shelve".format(in_dir)) | 26 | infer_model=shelve.open("{}/infer.shelve".format(in_dir)) |
| 27 | #['ASR', 'TRS', 'LABEL'] | 27 | #['ASR', 'TRS', 'LABEL'] |
| 28 | # In[6]: | 28 | # In[6]: |
| 29 | ASR=sparse_model["ASR_wid"] | 29 | ASR=sparse_model["ASR_wid"] |
| 30 | TRS=sparse_model["TRS_wid"] | 30 | TRS=sparse_model["TRS_wid"] |
| 31 | LABEL=sparse_model["LABEL"] | 31 | LABEL=sparse_model["LABEL"] |
| 32 | 32 | ||
| 33 | 33 | ||
| 34 | hidden_size=40 | 34 | hidden_size=40 |
| 35 | input_activation="tanh" | 35 | input_activation="tanh" |
| 36 | out_activation="tanh" | 36 | out_activation="tanh" |
| 37 | loss="mse" | 37 | loss="mse" |
| 38 | epochs=500 | 38 | epochs=500 |
| 39 | batch=1 | 39 | batch=1 |
| 40 | patience=60 | 40 | patience=60 |
| 41 | do_do=False | 41 | do_do=False |
| 42 | sgd = Adam(lr=0.00001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) | 42 | sgd = Adam(lr=0.00001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) |
| 43 | try : | 43 | try : |
| 44 | sgd_repr=sgd.get_config()["name"] | 44 | sgd_repr=sgd.get_config()["name"] |
| 45 | except AttributeError : | 45 | except AttributeError : |
| 46 | sgd_repr=sgd | 46 | sgd_repr=sgd |
| 47 | 47 | ||
| 48 | params={ "h1" : hidden_size, | 48 | params={ "h1" : hidden_size, |
| 49 | "inside_activation" : input_activation, | 49 | "inside_activation" : input_activation, |
| 50 | "out_activation" : out_activation, | 50 | "out_activation" : out_activation, |
| 51 | "do_dropout": do_do, | 51 | "do_dropout": do_do, |
| 52 | "loss" : loss, | 52 | "loss" : loss, |
| 53 | "epochs" : epochs , | 53 | "epochs" : epochs , |
| 54 | "batch_size" : batch, | 54 | "batch_size" : batch, |
| 55 | "patience" : patience, | 55 | "patience" : patience, |
| 56 | "sgd" : sgd_repr} | 56 | "sgd" : sgd_repr} |
| 57 | name = "_".join([ str(x) for x in params.values()]) | 57 | name = "_".join([ str(x) for x in params.values()]) |
| 58 | try: | 58 | try: |
| 59 | os.mkdir("{}/{}".format(in_dir,name)) | 59 | os.mkdir("{}/{}".format(in_dir,name)) |
| 60 | except: | 60 | except: |
| 61 | pass | 61 | pass |
| 62 | db = shelve.open("{}/{}/ae_model.shelve".format(in_dir,name),writeback=True) | 62 | db = shelve.open("{}/{}/ae_model.shelve".format(in_dir,name),writeback=True) |
| 63 | db["params"] = params | 63 | db["params"] = params |
| 64 | db["LABEL"]=LABEL | 64 | db["LABEL"]=LABEL |
| 65 | # | 65 | # |
| 66 | json.dump(params, | 66 | json.dump(params, |
| 67 | open("{}/{}/ae_model.json".format(in_dir,name),"w"), | 67 | open("{}/{}/ae_model.json".format(in_dir,name),"w"), |
| 68 | indent=4) | 68 | indent=4) |
| 69 | 69 | ||
| 70 | keys = ["ASR","TRS"] | 70 | keys = ["ASR","TRS"] |
| 71 | 71 | ||
| 72 | mlp_h = [ 40 , 25 , 40] | 72 | mlp_h = [ 512 , 1024 , 2048] |
| 73 | mlp_loss ="categorical_crossentropy" | 73 | mlp_loss ="categorical_crossentropy" |
| 74 | mlp_dropouts = [0,0,0,0] | 74 | mlp_dropouts = [0,0,0,0] |
| 75 | mlp_sgd = Adam(0.0001) | 75 | mlp_sgd = Adam(0.0001) |
| 76 | mlp_epochs = 200 | 76 | mlp_epochs = 200 |
| 77 | mlp_batch_size = 8 | 77 | mlp_batch_size = 8 |
| 78 | 78 | ||
| 79 | db["AE"] = {} | 79 | db["AE"] = {} |
| 80 | for mod in keys : | 80 | for mod in keys : |
| 81 | res=train_ae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"],infer_model["LDA"][mod]["TEST"],[params["h1"]],patience = params["patience"],sgd=sgd,in_activation="tanh",out_activation="tanh",loss=loss,epochs=epochs,batch_size=batch,verbose=0) | 81 | res=train_ae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"],infer_model["LDA"][mod]["TEST"],[params["h1"]],patience = params["patience"],sgd=sgd,in_activation="tanh",out_activation="tanh",loss=loss,epochs=epochs,batch_size=batch,verbose=0) |
| 82 | mlp_res_list=[] | 82 | mlp_res_list=[] |
| 83 | for layer in res : | 83 | for layer in res : |
| 84 | mlp_res_list.append(train_mlp(layer[0],LABEL["TRAIN"],layer[1],LABEL["DEV"],layer[2],LABEL["TEST"],mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs,batch_size=mlp_batch_size,fit_verbose=0)) | 84 | mlp_res_list.append(train_mlp(layer[0],LABEL["TRAIN"],layer[1],LABEL["DEV"],layer[2],LABEL["TEST"],mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs,batch_size=mlp_batch_size,fit_verbose=0)) |
| 85 | db["AE"][mod]=mlp_res_list | 85 | db["AE"][mod]=mlp_res_list |
| 86 | 86 | ||
| 87 | mod = "ASR" | 87 | mod = "ASR" |
| 88 | mod2= "TRS" | 88 | mod2= "TRS" |
| 89 | mlp_res_list=[] | 89 | mlp_res_list=[] |
| 90 | 90 | ||
| 91 | res = train_ae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"],infer_model["LDA"][mod]["TEST"],[params["h1"]],dropouts=[0],patience = params["patience"],sgd=sgd,in_activation="tanh",out_activation="tanh",loss=loss,epochs=epochs,batch_size=batch,y_train=infer_model["LDA"][mod]["TRAIN"],y_dev=infer_model["LDA"][mod2]["DEV"],y_test=infer_model["LDA"][mod2]["TEST"]) | 91 | res = train_ae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"],infer_model["LDA"][mod]["TEST"],[params["h1"]],dropouts=[0],patience = params["patience"],sgd=sgd,in_activation="tanh",out_activation="tanh",loss=loss,epochs=epochs,batch_size=batch,y_train=infer_model["LDA"][mod]["TRAIN"],y_dev=infer_model["LDA"][mod2]["DEV"],y_test=infer_model["LDA"][mod2]["TEST"]) |
| 92 | for layer in res : | 92 | for layer in res : |
| 93 | mlp_res_list.append(train_mlp(layer[0],LABEL["TRAIN"],layer[1],LABEL["DEV"],layer[2],LABEL["TEST"],mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs,batch_size=mlp_batch_size,fit_verbose=0)) | 93 | mlp_res_list.append(train_mlp(layer[0],LABEL["TRAIN"],layer[1],LABEL["DEV"],layer[2],LABEL["TEST"],mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs,batch_size=mlp_batch_size,fit_verbose=0)) |
| 94 | 94 | ||
| 95 | db["AE"]["SPE"] = mlp_res_list | 95 | db["AE"]["SPE"] = mlp_res_list |
| 96 | 96 | ||
| 97 | 97 | ||
| 98 | db.close() | 98 | db.close() |
| 99 | 99 |
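Results are written to a shelve inside a directory named by the underscore-joined parameter values. A sketch of reading them back; in_dir and the name string below are placeholders:

import shelve

name = "40_tanh_tanh_..."  # hypothetical: "_".join(str(x) for x in params.values())
db = shelve.open("output_v1/test2/{}/ae_model.shelve".format(name))  # hypothetical in_dir
print db["params"]
print db["AE"]["ASR"]  # one train_mlp result per autoencoder layer
print db["AE"]["SPE"]  # ASR-to-TRS autoencoder results
db.close()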
LDA/04e-mm_vae.py
| 1 | 1 | ||
| 2 | # coding: utf-8 | 2 | # coding: utf-8 |
| 3 | import gensim | 3 | import gensim |
| 4 | from scipy import sparse | 4 | from scipy import sparse |
| 5 | import itertools | 5 | import itertools |
| 6 | from sklearn import preprocessing | 6 | from sklearn import preprocessing |
| 7 | from keras.models import Sequential | 7 | from keras.models import Sequential |
| 8 | from keras.optimizers import SGD,Adam | 8 | from keras.optimizers import SGD,Adam |
| 9 | from mlp import * | 9 | from mlp import * |
| 10 | from vae import * | 10 | from vae import * |
| 11 | import sklearn.metrics | 11 | import sklearn.metrics |
| 12 | import shelve | 12 | import shelve |
| 13 | import pickle | 13 | import pickle |
| 14 | from utils import * | 14 | from utils import * |
| 15 | import sys | 15 | import sys |
| 16 | import os | 16 | import os |
| 17 | import json | 17 | import json |
| 18 | # In[4]: | 18 | # In[4]: |
| 19 | 19 | ||
| 20 | infer_model=shelve.open("{}".format(sys.argv[2])) | 20 | infer_model=shelve.open("{}".format(sys.argv[2])) |
| 21 | in_dir = sys.argv[1] | 21 | in_dir = sys.argv[1] |
| 22 | #['ASR', 'TRS', 'LABEL'] | 22 | #['ASR', 'TRS', 'LABEL'] |
| 23 | # In[6]: | 23 | # In[6]: |
| 24 | if len(sys.argv) > 4 : | 24 | if len(sys.argv) > 4 : |
| 25 | features_key = sys.argv[4] | 25 | features_key = sys.argv[4] |
| 26 | else : | 26 | else : |
| 27 | features_key = "LDA" | 27 | features_key = "LDA" |
| 28 | 28 | ||
| 29 | save_projection = True | 29 | save_projection = True |
| 30 | json_conf =json.load(open(sys.argv[3])) | 30 | json_conf =json.load(open(sys.argv[3])) |
| 31 | vae_conf = json_conf["vae"] | 31 | vae_conf = json_conf["vae"] |
| 32 | 32 | ||
| 33 | hidden_size= vae_conf["hidden_size"] | 33 | hidden_size= vae_conf["hidden_size"] |
| 34 | input_activation=vae_conf["input_activation"] | 34 | input_activation=vae_conf["input_activation"] |
| 35 | output_activation=vae_conf["output_activation"] | 35 | output_activation=vae_conf["output_activation"] |
| 36 | epochs=vae_conf["epochs"] | 36 | epochs=vae_conf["epochs"] |
| 37 | batch=vae_conf["batch"] | 37 | batch=vae_conf["batch"] |
| 38 | patience=vae_conf["patience"] | 38 | patience=vae_conf["patience"] |
| 39 | latent_dim = vae_conf["latent"] | 39 | latent_dim = vae_conf["latent"] |
| 40 | try: | 40 | try: |
| 41 | k = vae_conf["sgd"] | 41 | k = vae_conf["sgd"] |
| 42 | if vae_conf["sgd"]["name"] == "adam": | 42 | if vae_conf["sgd"]["name"] == "adam": |
| 43 | sgd = Adam(lr=vae_conf["sgd"]["lr"])#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) | 43 | sgd = Adam(lr=vae_conf["sgd"]["lr"])#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) |
| 44 | elif vae_conf["sgd"]["name"] == "sgd": | 44 | elif vae_conf["sgd"]["name"] == "sgd": |
| 45 | sgd = SGD(lr=vae_conf["sgd"]["lr"]) | 45 | sgd = SGD(lr=vae_conf["sgd"]["lr"]) |
| 46 | except: | 46 | except: |
| 47 | sgd = vae_conf["sgd"] | 47 | sgd = vae_conf["sgd"] |
| 48 | 48 | ||
| 49 | mlp_conf = json_conf["mlp"] | 49 | mlp_conf = json_conf["mlp"] |
| 50 | mlp_h = mlp_conf["hidden_size"] | 50 | mlp_h = mlp_conf["hidden_size"] |
| 51 | mlp_loss = mlp_conf["loss"] | 51 | mlp_loss = mlp_conf["loss"] |
| 52 | mlp_dropouts = mlp_conf["do"] | 52 | mlp_dropouts = mlp_conf["do"] |
| 53 | mlp_epochs = mlp_conf["epochs"] | 53 | mlp_epochs = mlp_conf["epochs"] |
| 54 | mlp_batch_size = mlp_conf["batch"] | 54 | mlp_batch_size = mlp_conf["batch"] |
| 55 | mlp_input_activation=mlp_conf["input_activation"] | 55 | mlp_input_activation=mlp_conf["input_activation"] |
| 56 | mlp_output_activation=mlp_conf["output_activation"] | 56 | mlp_output_activation=mlp_conf["output_activation"] |
| 57 | 57 | ||
| 58 | 58 | ||
| 59 | try: | 59 | try: |
| 60 | k = mlp_conf["sgd"] | 60 | k = mlp_conf["sgd"] |
| 61 | if mlp_conf["sgd"]["name"] == "adam": | 61 | if mlp_conf["sgd"]["name"] == "adam": |
| 62 | mlp_sgd = Adam(lr=mlp_conf["sgd"]["lr"])#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) | 62 | mlp_sgd = Adam(lr=mlp_conf["sgd"]["lr"])#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) |
| 63 | elif mlp_conf["sgd"]["name"] == "sgd": | 63 | elif mlp_conf["sgd"]["name"] == "sgd": |
| 64 | mlp_sgd = SGD(lr=mlp_conf["sgd"]["lr"]) | 64 | mlp_sgd = SGD(lr=mlp_conf["sgd"]["lr"]) |
| 65 | except: | 65 | except: |
| 66 | mlp_sgd = mlp_conf["sgd"] | 66 | mlp_sgd = mlp_conf["sgd"] |
| 67 | 67 | ||
| 68 | 68 | ||
| 69 | name = json_conf["name"] | 69 | name = json_conf["name"] |
| 70 | 70 | ||
| 71 | try : | 71 | try : |
| 72 | print "make folder " | 72 | print "make folder " |
| 73 | os.mkdir("{}/{}".format(in_dir,name)) | 73 | os.mkdir("{}/{}".format(in_dir,name)) |
| 74 | except: | 74 | except: |
| 75 | print "folder not maked" | 75 | print "folder not maked" |
| 76 | pass | 76 | pass |
| 77 | 77 | ||
| 78 | 78 | ||
| 79 | db = shelve.open("{}/{}/ae_model.shelve".format(in_dir,name),writeback=True) | 79 | db = shelve.open("{}/{}/ae_model.shelve".format(in_dir,name),writeback=True) |
| 80 | db["LABEL"]=infer_model["LABEL"] | 80 | db["LABEL"]=infer_model["LABEL"] |
| 81 | # | 81 | # |
| 82 | 82 | ||
| 83 | 83 | ||
| 84 | keys = infer_model[features_key].keys() | 84 | keys = infer_model[features_key].keys() |
| 85 | 85 | ||
| 86 | db["VAE"] = {} | 86 | db["VAE"] = {} |
| 87 | db[features_key] = {} | 87 | db[features_key] = {} |
| 88 | for mod in keys : | 88 | for mod in keys : |
| 89 | #print mod | 89 | #print mod |
| 90 | db[features_key][mod] = train_mlp(infer_model[features_key][mod]["TRAIN"],infer_model["LABEL"][mod]["TRAIN"], | 90 | db[features_key][mod] = train_mlp(infer_model[features_key][mod]["TRAIN"],infer_model["LABEL"][mod]["TRAIN"], |
| 91 | infer_model[features_key][mod]["DEV"],infer_model["LABEL"][mod]["DEV"], | 91 | infer_model[features_key][mod]["DEV"],infer_model["LABEL"][mod]["DEV"], |
| 92 | infer_model[features_key][mod]["TEST"],infer_model["LABEL"][mod]["TEST"], | 92 | infer_model[features_key][mod]["TEST"],infer_model["LABEL"][mod]["TEST"], |
| 93 | mlp_h ,sgd=mlp_sgd, | 93 | mlp_h ,sgd=mlp_sgd, |
| 94 | epochs=mlp_epochs, | 94 | epochs=mlp_epochs, |
| 95 | batch_size=mlp_batch_size, | 95 | batch_size=mlp_batch_size, |
| 96 | input_activation=input_activation, | 96 | input_activation=input_activation, |
| 97 | output_activation=mlp_output_activation, | 97 | output_activation=mlp_output_activation, |
| 98 | dropouts=mlp_dropouts, | 98 | dropouts=mlp_dropouts, |
| 99 | fit_verbose=0) | 99 | fit_verbose=0) |
| 100 | 100 | ||
| 101 | res=train_vae(infer_model[features_key][mod]["TRAIN"],infer_model[features_key][mod]["DEV"],infer_model[features_key][mod]["TEST"], | 101 | res=train_vae(infer_model[features_key][mod]["TRAIN"],infer_model[features_key][mod]["DEV"],infer_model[features_key][mod]["TEST"], |
| 102 | hidden_size=hidden_size[0], | 102 | hidden_size=hidden_size[0], |
| 103 | latent_dim=latent_dim,sgd=sgd, | 103 | latent_dim=latent_dim,sgd=sgd, |
| 104 | input_activation=input_activation,output_activation=output_activation, | 104 | input_activation=input_activation,output_activation=output_activation, |
| 105 | nb_epochs=epochs,batch_size=batch) | 105 | nb_epochs=epochs,batch_size=batch) |
| 106 | mlp_res_list=[] | 106 | mlp_res_list=[] |
| 107 | for nb,layer in enumerate(res) : | 107 | for nb,layer in enumerate(res) : |
| 108 | if save_projection: | 108 | if save_projection: |
| 109 | pd = pandas.DataFrame(layer[0]) | 109 | pd = pandas.DataFrame(layer[0]) |
| 110 | col_count = (pd.sum(axis=0) != 0) | 110 | col_count = (pd.sum(axis=0) != 0) |
| 111 | pd = pd.loc[:,cyyol_count] | 111 | pd = pd.loc[:,col_count] |
| 112 | pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,mod),"TRAIN") | 112 | pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,mod),"TRAIN") |
| 113 | pd = pandas.DataFrame(layer[1]) | 113 | pd = pandas.DataFrame(layer[1]) |
| 114 | pd = pd.loc[:,col_count] | 114 | pd = pd.loc[:,col_count] |
| 115 | pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,mod),"DEV") | 115 | pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,mod),"DEV") |
| 116 | pd = pandas.DataFrame(layer[2]) | 116 | pd = pandas.DataFrame(layer[2]) |
| 117 | pd = pd.loc[:,col_count] | 117 | pd = pd.loc[:,col_count] |
| 118 | pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,mod),"TEST") | 118 | pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,mod),"TEST") |
| 119 | del pd | 119 | del pd |
| 120 | 120 | ||
| 121 | mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], | 121 | mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], |
| 122 | layer[1],infer_model["LABEL"][mod]["DEV"], | 122 | layer[1],infer_model["LABEL"][mod]["DEV"], |
| 123 | layer[2],infer_model["LABEL"][mod]["TEST"], | 123 | layer[2],infer_model["LABEL"][mod]["TEST"], |
| 124 | mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs, | 124 | mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs, |
| 125 | output_activation=mlp_output_activation, | 125 | output_activation=mlp_output_activation, |
| 126 | input_activation=input_activation, | 126 | input_activation=input_activation, |
| 127 | batch_size=mlp_batch_size,fit_verbose=0)) | 127 | batch_size=mlp_batch_size,fit_verbose=0)) |
| 128 | db["VAE"][mod]=mlp_res_list | 128 | db["VAE"][mod]=mlp_res_list |
| 129 | 129 | ||
| 130 | if "ASR" in keys and "TRS" in keys : | 130 | if "ASR" in keys and "TRS" in keys : |
| 131 | mod = "ASR" | 131 | mod = "ASR" |
| 132 | mod2= "TRS" | 132 | mod2= "TRS" |
| 133 | mlp_res_list=[] | 133 | mlp_res_list=[] |
| 134 | 134 | ||
| 135 | res = train_vae(infer_model[features_key][mod]["TRAIN"], | 135 | res = train_vae(infer_model[features_key][mod]["TRAIN"], |
| 136 | infer_model[features_key][mod]["DEV"], | 136 | infer_model[features_key][mod]["DEV"], |
| 137 | infer_model[features_key][mod]["TEST"], | 137 | infer_model[features_key][mod]["TEST"], |
| 138 | hidden_size=hidden_size[0], | 138 | hidden_size=hidden_size[0], |
| 139 | sgd=sgd,input_activation=input_activation,output_activation=output_activation, | 139 | sgd=sgd,input_activation=input_activation,output_activation=output_activation, |
| 140 | latent_dim=latent_dim, | 140 | latent_dim=latent_dim, |
| 141 | nb_epochs=epochs, | 141 | nb_epochs=epochs, |
| 142 | batch_size=batch, | 142 | batch_size=batch, |
| 143 | y_train=infer_model[features_key][mod2]["TRAIN"], | 143 | y_train=infer_model[features_key][mod2]["TRAIN"], |
| 144 | y_dev=infer_model[features_key][mod2]["DEV"], | 144 | y_dev=infer_model[features_key][mod2]["DEV"], |
| 145 | y_test=infer_model[features_key][mod2]["TEST"]) | 145 | y_test=infer_model[features_key][mod2]["TEST"]) |
| 146 | 146 | ||
| 147 | for nb,layer in enumerate(res) : | 147 | for nb,layer in enumerate(res) : |
| 148 | if save_projection: | 148 | if save_projection: |
| 149 | pd = pandas.DataFrame(layer[0]) | 149 | pd = pandas.DataFrame(layer[0]) |
| 150 | col_count = (pd.sum(axis=0) != 0) | 150 | col_count = (pd.sum(axis=0) != 0) |
| 151 | pd = pd.loc[:,col_count] | 151 | pd = pd.loc[:,col_count] |
| 152 | pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,"SPE"),"TRAIN") | 152 | pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,"SPE"),"TRAIN") |
| 153 | pd = pandas.DataFrame(layer[1]) | 153 | pd = pandas.DataFrame(layer[1]) |
| 154 | pd = pd.loc[:,col_count] | 154 | pd = pd.loc[:,col_count] |
| 155 | pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,"SPE"),"DEV") | 155 | pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,"SPE"),"DEV") |
| 156 | pd = pandas.DataFrame(layer[2]) | 156 | pd = pandas.DataFrame(layer[2]) |
| 157 | pd = pd.loc[:,col_count] | 157 | pd = pd.loc[:,col_count] |
| 158 | pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,"SPE"),"TEST") | 158 | pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,"SPE"),"TEST") |
| 159 | del pd | 159 | del pd |
| 160 | 160 | ||
| 161 | mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], | 161 | mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], |
| 162 | layer[1],infer_model["LABEL"][mod]["DEV"], | 162 | layer[1],infer_model["LABEL"][mod]["DEV"], |
| 163 | layer[2],infer_model["LABEL"][mod]["TEST"], | 163 | layer[2],infer_model["LABEL"][mod]["TEST"], |
| 164 | mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs, | 164 | mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs, |
| 165 | output_activation=mlp_output_activation, | 165 | output_activation=mlp_output_activation, |
| 166 | input_activation=input_activation, | 166 | input_activation=input_activation, |
| 167 | batch_size=mlp_batch_size,fit_verbose=0)) | 167 | batch_size=mlp_batch_size,fit_verbose=0)) |
| 168 | 168 | ||
| 169 | db["VAE"]["SPE"] = mlp_res_list | 169 | db["VAE"]["SPE"] = mlp_res_list |
| 170 | 170 | ||
| 171 | db.sync() | 171 | db.sync() |
| 172 | db.close() | 172 | db.close() |
| 173 | 173 |
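The script reads its hyperparameters from the JSON file passed as sys.argv[3]. A sketch of a configuration with the layout the code above expects, reconstructed from the keys it accesses; every value here is illustrative:

# Writes a conf.json matching the keys read by this script; all values are
# placeholders, not recommended settings.
import json

conf = {
    "name": "vae_exp1",
    "vae": {
        "hidden_size": [80],          # script uses hidden_size[0]
        "input_activation": "tanh",
        "output_activation": "sigmoid",
        "epochs": 100,
        "batch": 8,
        "patience": 60,
        "latent": 12,
        "sgd": {"name": "adam", "lr": 0.0001}
    },
    "mlp": {
        "hidden_size": [512],
        "loss": "categorical_crossentropy",
        "do": [0, 0],                 # dropout per layer
        "epochs": 200,
        "batch": 8,
        "input_activation": "tanh",
        "output_activation": "softmax",
        "sgd": {"name": "adam", "lr": 0.0001}
    }
}
json.dump(conf, open("conf.json", "w"), indent=4)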
LDA/run.sh
| 1 | python 00-prepross.py | File was deleted | |
| 2 | python 02-lda_split.py DECODA_list_wid.shelve output_v1/ 100 12 test2 1 400 | ||
| 3 | python 03-mono_perplex.py DECODA_list_wid.shelve output_v1/test2 output_v1/t2db.json | ||
| 4 | 1 | python 00-prepross.py |
LDA/vae.py
| 1 | '''This script demonstrates how to build a variational autoencoder with Keras. | 1 | '''This script demonstrates how to build a variational autoencoder with Keras. |
| 2 | Reference: "Auto-Encoding Variational Bayes" https://arxiv.org/abs/1312.6114 | 2 | Reference: "Auto-Encoding Variational Bayes" https://arxiv.org/abs/1312.6114 |
| 3 | ''' | 3 | ''' |
| 4 | 4 | ||
| 5 | import itertools | 5 | import itertools |
| 6 | import sys | 6 | import sys |
| 7 | import json | 7 | import json |
| 8 | 8 | ||
| 9 | import numpy as np | 9 | import numpy as np |
| 10 | import matplotlib.pyplot as plt | 10 | import matplotlib.pyplot as plt |
| 11 | from scipy import sparse | 11 | from scipy import sparse |
| 12 | import scipy.io | 12 | import scipy.io |
| 13 | 13 | ||
| 14 | from keras.layers import Input, Dense, Lambda | 14 | from keras.layers import Input, Dense, Lambda |
| 15 | from keras.models import Model | 15 | from keras.models import Model |
| 16 | from keras import backend as K | 16 | from keras import backend as K |
| 17 | from keras import objectives | 17 | from keras import objectives |
| 18 | from keras.datasets import mnist | 18 | from keras.datasets import mnist |
| 19 | from keras.callbacks import EarlyStopping,Callback | 19 | from keras.callbacks import EarlyStopping,Callback |
| 20 | 20 | ||
| 21 | import pandas | 21 | import pandas |
| 22 | import shelve | 22 | import shelve |
| 23 | import pickle | 23 | import pickle |
| 24 | 24 | ||
| 25 | 25 | ||
| 26 | class ZeroStopping(Callback): | 26 | class ZeroStopping(Callback): |
| 27 | '''Stop training when a monitored quantity has stopped improving. | 27 | '''Stop training when a monitored quantity has stopped improving. |
| 28 | # Arguments | 28 | # Arguments |
| 29 | monitor: quantity to be monitored. | 29 | monitor: quantity to be monitored. |
| 30 | patience: number of epochs with no improvement | 30 | patience: number of epochs with no improvement |
| 31 | after which training will be stopped. | 31 | after which training will be stopped. |
| 32 | verbose: verbosity mode. | 32 | verbose: verbosity mode. |
| 33 | mode: one of {auto, min, max}. In 'min' mode, | 33 | mode: one of {auto, min, max}. In 'min' mode, |
| 34 | training will stop when the quantity | 34 | training will stop when the quantity |
| 35 | monitored has stopped decreasing; in 'max' | 35 | monitored has stopped decreasing; in 'max' |
| 36 | mode it will stop when the quantity | 36 | mode it will stop when the quantity |
| 37 | monitored has stopped increasing. | 37 | monitored has stopped increasing. |
| 38 | ''' | 38 | ''' |
| 39 | def __init__(self, monitor='val_loss', verbose=0, mode='auto', thresh = 0): | 39 | def __init__(self, monitor='val_loss', verbose=0, mode='auto', thresh = 0): |
| 40 | super(ZeroStopping, self).__init__() | 40 | super(ZeroStopping, self).__init__() |
| 41 | 41 | ||
| 42 | self.monitor = monitor | 42 | self.monitor = monitor |
| 43 | self.verbose = verbose | 43 | self.verbose = verbose |
| 44 | self.thresh = thresh # threshold value | 44 | self.thresh = thresh # threshold value |
| 45 | 45 | ||
| 46 | if mode not in ['auto', 'min', 'max']: | 46 | if mode not in ['auto', 'min', 'max']: |
| 47 | warnings.warn('EarlyStopping mode %s is unknown, ' | 47 | warnings.warn('EarlyStopping mode %s is unknown, ' |
| 48 | 'fallback to auto mode.' % (self.mode), | 48 | 'fallback to auto mode.' % (self.mode), |
| 49 | RuntimeWarning) | 49 | RuntimeWarning) |
| 50 | mode = 'auto' | 50 | mode = 'auto' |
| 51 | 51 | ||
| 52 | if mode == 'min': | 52 | if mode == 'min': |
| 53 | self.monitor_op = np.less | 53 | self.monitor_op = np.less |
| 54 | elif mode == 'max': | 54 | elif mode == 'max': |
| 55 | self.monitor_op = np.greater | 55 | self.monitor_op = np.greater |
| 56 | else: | 56 | else: |
| 57 | if 'acc' in self.monitor: | 57 | if 'acc' in self.monitor: |
| 58 | self.monitor_op = np.greater | 58 | self.monitor_op = np.greater |
| 59 | else: | 59 | else: |
| 60 | self.monitor_op = np.less | 60 | self.monitor_op = np.less |
| 61 | 61 | ||
| 62 | def on_epoch_end(self, epoch, logs={}): | 62 | def on_epoch_end(self, epoch, logs={}): |
| 63 | current = logs.get(self.monitor) | 63 | current = logs.get(self.monitor) |
| 64 | if current is None: | 64 | if current is None: |
| 65 | warnings.warn('Zero stopping requires %s available!' % | 65 | warnings.warn('Zero stopping requires %s available!' % |
| 66 | (self.monitor), RuntimeWarning) | 66 | (self.monitor), RuntimeWarning) |
| 67 | 67 | ||
| 68 | if self.monitor_op(current, self.thresh): | 68 | if self.monitor_op(current, self.thresh): |
| 69 | self.best = current | 69 | self.best = current |
| 70 | self.model.stop_training = True | 70 | self.model.stop_training = True |
| 71 | 71 | ||
| 72 | #batch_size = 16 | 72 | #batch_size = 16 |
| 73 | #original_dim = 784 | 73 | #original_dim = 784 |
| 74 | #latent_dim = 2 | 74 | #latent_dim = 2 |
| 75 | #intermediate_dim = 128 | 75 | #intermediate_dim = 128 |
| 76 | #epsilon_std = 0.01 | 76 | #epsilon_std = 0.01 |
| 77 | #nb_epoch = 40 | 77 | #nb_epoch = 40 |
| 78 | 78 | ||
| 79 | 79 | ||
| 80 | 80 | ||
| 81 | 81 | ||
| 82 | def train_vae(x_train,x_dev,x_test,y_train=None,y_dev=None,y_test=None,hidden_size=80,latent_dim=12,batch_size=8,nb_epochs=10,sgd="rmsprop",input_activation = "relu",output_activation = "sigmoid",epsilon_std=0.01): | 82 | def train_vae(x_train,x_dev,x_test,y_train=None,y_dev=None,y_test=None,hidden_size=80,latent_dim=12,batch_size=8,nb_epochs=10,sgd="rmsprop",input_activation = "relu",output_activation = "sigmoid",epsilon_std=0.01): |
| 83 | 83 | ||
| 84 | 84 | ||
| 85 | 85 | ||
| 86 | def sampling(args): | 86 | def sampling(args): |
| 87 | z_mean, z_log_std = args | 87 | z_mean, z_log_std = args |
| 88 | epsilon = K.random_normal(shape=(batch_size, latent_dim), | 88 | epsilon = K.random_normal(shape=(batch_size, latent_dim), |
| 89 | mean=0., std=epsilon_std) | 89 | mean=0., std=epsilon_std) |
| 90 | return z_mean + K.exp(z_log_std) * epsilon | 90 | return z_mean + K.exp(z_log_std) * epsilon |
| 91 | 91 | ||
| 92 | def vae_loss(x, x_decoded_mean): | 92 | def vae_loss(x, x_decoded_mean): |
| 93 | xent_loss = objectives.binary_crossentropy(x, x_decoded_mean) | 93 | xent_loss = objectives.binary_crossentropy(x, x_decoded_mean) |
| 94 | kl_loss = - 0.5 * K.mean(1 + z_log_std - K.square(z_mean) - K.exp(z_log_std), axis=-1) | 94 | kl_loss = - 0.5 * K.mean(1 + z_log_std - K.square(z_mean) - K.exp(z_log_std), axis=-1) |
| 95 | return xent_loss + kl_loss | 95 | return xent_loss + kl_loss |
| 96 | 96 | ||
| 97 | original_dim = x_train.shape[1] | 97 | original_dim = x_train.shape[1] |
| 98 | 98 | ||
| 99 | 99 | ||
| 100 | x = Input(batch_shape=(batch_size, original_dim)) | 100 | x = Input(batch_shape=(batch_size, original_dim)) |
| 101 | h = Dense(hidden_size, activation=input_activation)(x) | 101 | h = Dense(hidden_size, activation=input_activation)(x) |
| 102 | z_mean = Dense(latent_dim)(h) | 102 | z_mean = Dense(latent_dim)(h) |
| 103 | z_log_std = Dense(latent_dim)(h) | 103 | z_log_std = Dense(latent_dim)(h) |
| 104 | 104 | ||
| 105 | 105 | ||
| 106 | # note that "output_shape" isn't necessary with the TensorFlow backend | 106 | # note that "output_shape" isn't necessary with the TensorFlow backend |
| 107 | # so you could write `Lambda(sampling)([z_mean, z_log_std])` | 107 | # so you could write `Lambda(sampling)([z_mean, z_log_std])` |
| 108 | z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_std]) | 108 | z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_std]) |
| 109 | 109 | ||
| 110 | # we instantiate these layers separately so as to reuse them later | 110 | # we instantiate these layers separately so as to reuse them later |
| 111 | decoder_h = Dense(hidden_size, activation=input_activation) | 111 | decoder_h = Dense(hidden_size, activation=input_activation) |
| 112 | decoder_mean = Dense(original_dim, activation=output_activation) | 112 | decoder_mean = Dense(original_dim, activation=output_activation) |
| 113 | h_decoded = decoder_h(z) | 113 | h_decoded = decoder_h(z) |
| 114 | x_decoded_mean = decoder_mean(h_decoded) | 114 | x_decoded_mean = decoder_mean(h_decoded) |
| 115 | 115 | ||
| 116 | 116 | ||
| 117 | vae = Model(x, x_decoded_mean) | 117 | vae = Model(x, x_decoded_mean) |
| 118 | vae.compile(optimizer=sgd, loss=vae_loss) | 118 | vae.compile(optimizer=sgd, loss=vae_loss) |
| 119 | 119 | ||
| 120 | # train the VAE on MNIST digits | 120 | # train the VAE on MNIST digits |
| 121 | if y_train is None or y_dev is None or y_test is None : | 121 | if y_train is None or y_dev is None or y_test is None : |
| 122 | y_train = x_train | 122 | y_train = x_train |
| 123 | y_dev = x_dev | 123 | y_dev = x_dev |
| 124 | y_test = x_test | 124 | y_test = x_test |
| 125 | 125 | ||
| 126 | vae.fit(x_train, y_train, | 126 | vae.fit(x_train, y_train, |
| 127 | shuffle=True, | 127 | shuffle=True, |
| 128 | nb_epoch=nb_epochs, | 128 | nb_epoch=nb_epochs, |
| 129 | verbose = 1, | 129 | verbose = 1, |
| 130 | batch_size=batch_size, | 130 | batch_size=batch_size, |
| 131 | validation_data=(x_dev, y_dev), | 131 | validation_data=(x_dev, y_dev) |
| 132 | callbacks = [ZeroStopping(monitor='val_loss', thresh=0, verbose=0, mode='min')] | 132 | #callbacks = [ZeroStopping(monitor='val_loss', thresh=0, verbose=0, mode='min')] |
| 133 | ) | 133 | ) |
| 134 | 134 | ||
| 135 | # build a model to project inputs on the latent space | 135 | # build a model to project inputs on the latent space |
| 136 | encoder = Model(x, z_mean) | 136 | encoder = Model(x, z_mean) |
| 137 | pred_train = encoder.predict(x_train, batch_size=batch_size) | 137 | pred_train = encoder.predict(x_train, batch_size=batch_size) |
| 138 | pred_dev = encoder.predict(x_dev, batch_size=batch_size) | 138 | pred_dev = encoder.predict(x_dev, batch_size=batch_size) |
| 139 | pred_test = encoder.predict(x_test,batch_size=batch_size) | 139 | pred_test = encoder.predict(x_test,batch_size=batch_size) |
| 140 | return [ [ pred_train, pred_dev, pred_test ] ] | 140 | return [ [ pred_train, pred_dev, pred_test ] ] |
| 141 | # display a 2D plot of the digit classes in the latent space | 141 | # display a 2D plot of the digit classes in the latent space |
| 142 | #x_test_encoded = encoder.predict(x_test, batch_size=batch_size) | 142 | #x_test_encoded = encoder.predict(x_test, batch_size=batch_size) |
| 143 | # build a digit generator that can sample from the learned distribution | 143 | # build a digit generator that can sample from the learned distribution |
| 144 | #decoder_input = Input(shape=(latent_dim,)) | 144 | #decoder_input = Input(shape=(latent_dim,)) |
| 145 | #_h_decoded = decoder_h(decoder_input) | 145 | #_h_decoded = decoder_h(decoder_input) |
| 146 | #_x_decoded_mean = decoder_mean(_h_decoded) | 146 | #_x_decoded_mean = decoder_mean(_h_decoded) |
| 147 | #generator = Model(decoder_input, _x_decoded_mean) | 147 | #generator = Model(decoder_input, _x_decoded_mean) |
| 148 | #x_decoded = generator.predict(z_sample) | 148 | #x_decoded = generator.predict(z_sample) |
| 149 | 149 | ||
| 150 | 150 |
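train_vae returns a single [train, dev, test] triple of encoder projections. A minimal usage sketch on random data (shapes and the old-Keras environment this module targets are assumptions); sample counts are kept divisible by batch_size because the model is built with a fixed batch_shape:

import numpy as np

# Arbitrary dummy data: 100-dimensional features, counts divisible by batch_size=8.
x_train = np.random.rand(800, 100)
x_dev = np.random.rand(200, 100)
x_test = np.random.rand(200, 100)

layers = train_vae(x_train, x_dev, x_test,
                   hidden_size=80, latent_dim=12,
                   batch_size=8, nb_epochs=10)
pred_train, pred_dev, pred_test = layers[0]
print pred_train.shape  # (800, 12): latent means from the encoder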