Commit 7db73861ffbab3f3f51b17188d8894a512b36264
1 parent: b6d0165d16
Exists in: master
add vae and mmf
Showing 13 changed files with 1084 additions and 44 deletions
LDA/00-mmf_make_features.py
| 1 | +import sys | |
| 2 | +import os | |
| 3 | + | |
| 4 | +import pandas | |
| 5 | +import numpy | |
| 6 | +import shelve | |
| 7 | + | |
| 8 | +from sklearn.preprocessing import LabelBinarizer | |
| 9 | + | |
| 10 | +from utils import select_mmf as select | |
| 11 | + | |
| 12 | +input_dir = sys.argv[1] # top-level directory containing ASR and TRS | |
| 13 | +level = sys.argv[2] # desired LDA size (-5) | |
| 14 | + | |
| 15 | +lb=LabelBinarizer() | |
| 16 | +#y_train=lb.fit_transform([utils.select(ligneid) for ligneid in origin_corps["LABEL"]["TRAIN"]]) | |
| 17 | + | |
| 18 | + | |
| 19 | +# writeback=True so in-place updates to the nested dicts persist on sync() | |
| 20 | +data = shelve.open("{}/mmf_{}.shelve".format(input_dir,level), writeback=True) | |
| 21 | +data["LABEL"] = {} | |
| 22 | +data["LDA"] = {} | |
| 23 | +for mod in ["ASR", "TRS"] : | |
| 22 | + train = pandas.read_table("{}/{}/train_{}.ssv".format(input_dir, mod, level), sep=" ", header=None ) | |
| 23 | + dev = pandas.read_table("{}/{}/dev_{}.ssv".format(input_dir, mod, level), sep=" ", header=None ) | |
| 24 | + test = pandas.read_table("{}/{}/test_{}.ssv".format(input_dir, mod, level), sep=" ", header=None ) | |
| 25 | + | |
| 26 | + y_train = train.iloc[:,0].apply(select) | |
| 27 | + y_dev = dev.iloc[:,0].apply(select) | |
| 28 | + y_test = test.iloc[:,0].apply(select) | |
| 29 | + lb.fit(y_train) | |
| 30 | + data["LABEL"][mod]={"TRAIN":lb.transform(y_train),"DEV":lb.transform(y_dev), "TEST": lb.transform(y_test)} | |
| 31 | + | |
| 32 | + data["LDA"][mod]={} | |
| 33 | + data["LDA"][mod]["TRAIN"]=train.iloc[:,1:].values | |
| 34 | + data["LDA"][mod]["DEV"]=dev.iloc[:,1:].values | |
| 35 | + data["LDA"][mod]["TEST"]=test.iloc[:,1:].values | |
| 36 | + | |
| 37 | +data.sync() | |
| 38 | +data.close() |
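For reference, a minimal reader sketch (not part of this commit; the path and key layout follow the script above) showing how a downstream step can load the features back:

```python
# Hypothetical reader for the shelve written by 00-mmf_make_features.py.
# Layout (from the script above): data["LDA"][mod] and data["LABEL"][mod]
# each hold "TRAIN"/"DEV"/"TEST" arrays, for mod in ("ASR", "TRS").
import shelve

data = shelve.open("input_dir/mmf_5.shelve")  # example path
for mod in ["ASR", "TRS"]:
    X_train = data["LDA"][mod]["TRAIN"]    # LDA topic features
    y_train = data["LABEL"][mod]["TRAIN"]  # binarized labels
    print(mod, X_train.shape, y_train.shape)
data.close()
```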
LDA/02-lda.py
| ... | ... | @@ -12,10 +12,11 @@ |
| 12 | 12 | import dill |
| 13 | 13 | from tinydb import TinyDB, where, Query |
| 14 | 14 | import time |
| 15 | +from joblib import Parallel, delayed | |
| 15 | 16 | |
| 16 | 17 | def calc_perp(models,train): |
| 17 | 18 | |
| 18 | - | |
| 19 | + | |
| 19 | 20 | stop_words=models[1] |
| 20 | 21 | name = models[0] |
| 21 | 22 | |
| ... | ... | @@ -45,7 +46,8 @@ |
| 45 | 46 | def train_lda(out_dir,train,size,it,sw_size,alpha,eta,passes,chunk): |
| 46 | 47 | name = "s{}_it{}_sw{}_a{}_e{}_p{}_c{}".format(size,it,sw_size,alpha,eta,passes,chunk) |
| 47 | 48 | logging.warning(name) |
| 48 | - if os.path.isfile(out_dir+"/"+name+".dill"): | |
| 49 | + deep_out_dir = out_dir+"/"+name | |
| 50 | + if os.path.isdir(deep_out_dir): | |
| 49 | 51 | logging.error(name+" already done") |
| 50 | 52 | return |
| 51 | 53 | logging.warning(name+" to be done") |
| ... | ... | @@ -54,7 +56,6 @@ |
| 54 | 56 | asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ] |
| 55 | 57 | trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ] |
| 56 | 58 | stop_words=set(asr_sw) | set(trs_sw) |
| 57 | - stop_words=[ x.strip() for x in open("french.txt").readlines() ] | |
| 58 | 59 | |
| 59 | 60 | logging.warning("TRS to be done") |
| 60 | 61 | |
| 61 | 62 | |
| 62 | 63 | |
| 63 | 64 | |
| ... | ... | @@ -68,19 +69,42 @@ |
| 68 | 69 | asr_probs = [] |
| 69 | 70 | for line in lda_asr.expElogbeta: |
| 70 | 71 | nline = line / np.sum(line) |
| 71 | - asr_probs.append( str(x) for x in nline) | |
| 72 | + asr_probs.append([ str(x) for x in nline]) | |
| 72 | 73 | trs_probs = [] |
| 73 | 74 | for line in lda_trs.expElogbeta: |
| 74 | 75 | nline = line / np.sum(line) |
| 75 | - trs_probs.append( str(x) for x in nline) | |
| 76 | + trs_probs.append([str(x) for x in nline]) | |
| 76 | 77 | |
| 77 | 78 | K = lda_asr.num_topics |
| 78 | 79 | topicWordProbMat_asr = lda_asr.print_topics(K,10) |
| 79 | 80 | |
| 80 | 81 | K = lda_trs.num_topics |
| 81 | 82 | topicWordProbMat_trs = lda_trs.print_topics(K,10) |
| 83 | + os.mkdir(deep_out_dir) | |
| 84 | + dill.dump([x for x in stop_words],open(deep_out_dir+"/stopwords.dill","w")) | |
| 85 | + lda_asr.save(deep_out_dir+"/lda_asr.model") | |
| 86 | + lda_trs.save(deep_out_dir+"/lda_trs.model") | |
| 87 | + dill.dump([x for x in asr_probs],open(deep_out_dir+"/lda_asr_probs.dill","w")) | |
| 88 | + dill.dump([x for x in trs_probs],open(deep_out_dir+"/lda_trs_probs.dill","w")) | |
| 89 | + | |
| 82 | 90 | return [name, stop_words, lda_asr , asr_probs , topicWordProbMat_asr, lda_trs, trs_probs, topicWordProbMat_trs] |
| 83 | 91 | |
| 92 | +def train_one(name,train,s,i,sw,a,e,p,c): | |
| 93 | + st=time.time() | |
| 94 | + logging.warning(" ; ".join([str(x) for x in [s,i,sw,a,e,p,c]])) | |
| 95 | + models = train_lda(name,train,s,i,sw,a,e,p,c) | |
| 96 | + if models: | |
| 97 | + m = calc_perp(models,train) | |
| 98 | + #dill.dump(models,open("{}/{}.dill".format(name,models[0]),"wb")) | |
| 99 | + else : | |
| 100 | + m = None | |
| 101 | + end = time.time() # do not reuse "e": it still names the eta parameter | |
| 102 | + logging.warning("done in : {}".format(end-st)) | |
| 103 | + return m | |
| 104 | + | |
| 105 | + | |
| 106 | + | |
| 107 | + | |
| 84 | 108 | if __name__ == "__main__": |
| 85 | 109 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING) |
| 86 | 110 | |
| ... | ... | @@ -109,6 +133,8 @@ |
| 109 | 133 | db = TinyDB(db_path) |
| 110 | 134 | nb_model = len(passes) * len(chunk) * len(it) * len(sw_size) * len(alpha) * len(eta) * len(size) |
| 111 | 135 | logging.warning(" hey will train {} models ".format(nb_model)) |
| 136 | + | |
| 137 | + args_list=[] | |
| 112 | 138 | for p in passes: |
| 113 | 139 | for c in chunk: |
| 114 | 140 | for i in it : |
| ... | ... | @@ -116,13 +142,9 @@ |
| 116 | 142 | for a in alpha: |
| 117 | 143 | for e in eta: |
| 118 | 144 | for s in size: |
| 119 | - st=time.time() | |
| 120 | - logging.warning(" ; ".join([str(x) for x in [s,i,sw,a,e,p,c]])) | |
| 121 | - models = train_lda(name,train,s,i,sw,a,e,p,c) | |
| 122 | - if models: | |
| 123 | - m = calc_perp(models,train) | |
| 124 | - dill.dump(models,open("{}/{}.dill".format(name,models[0]),"wb")) | |
| 125 | - db.insert(m) | |
| 126 | - e = time.time() | |
| 127 | - logging.warning("fin en : {}".format(e-st)) | |
| 145 | + args_list.append((name,train,s,i,sw,a,e,p,c)) | |
| 146 | + res_list = Parallel(n_jobs=15)(delayed(train_one)(*args) for args in args_list) | |
| 147 | + for m in res_list : | |
| 148 | + if m is not None : # train_one returns None when the model was already done | |
| 149 | + db.insert(m) | |
| 149 | + |
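The change above replaces the nested loop's body with joblib's fan-out idiom: gather every argument tuple first, then dispatch them to worker processes. A self-contained sketch of the same pattern (toy function standing in for train_one):

```python
from joblib import Parallel, delayed

def work(size, iterations):
    # stand-in for train_one: any picklable function works here
    return size * iterations

args_list = [(s, i) for s in (10, 20) for i in (1, 2, 3)]
# dispatch to 2 workers; results come back in args_list order
results = Parallel(n_jobs=2)(delayed(work)(*args) for args in args_list)
print(results)  # [10, 20, 30, 20, 40, 60]
```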
LDA/03-mono_perplex.py
| ... | ... | @@ -52,7 +52,7 @@ |
| 52 | 52 | input_dir = sys.argv[2] |
| 53 | 53 | db_path = sys.argv[3] |
| 54 | 54 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING) |
| 55 | - folders = glob.glob("{}/*".format(input_dir)) | |
| 55 | + folders = glob.glob("{}/s*".format(input_dir)) | |
| 56 | 56 | |
| 57 | 57 | #train=pickle.load(open("{}/newsgroup_bow_train.pk".format(input_dir))) |
| 58 | 58 | train = shelve.open(input_shelve) |
LDA/03-perplex.py
| ... | ... | @@ -22,40 +22,43 @@ |
| 22 | 22 | |
| 23 | 23 | |
| 24 | 24 | def calc_perp(params): |
| 25 | - in_dir,train = params | |
| 26 | - name = in_dir.split("/")[-1] | |
| 27 | - # s40_it1_sw50_a0.01_e0.1_p6_c1000 | |
| 25 | + in_dir,train = params | |
| 26 | + name = in_dir.split("/")[-1] | |
| 27 | + try: | |
| 28 | + # s40_it1_sw50_a0.01_e0.1_p6_c1000 | |
| 28 | 29 | |
| 29 | - entry = Query() | |
| 30 | - value=db.search(entry.name == name) | |
| 31 | - if len(value) > 0 : | |
| 32 | - logging.warning("{} already done".format(name)) | |
| 33 | - return | |
| 30 | + entry = Query() | |
| 31 | + value=db.search(entry.name == name) | |
| 32 | + if len(value) > 0 : | |
| 33 | + logging.warning("{} already done".format(name)) | |
| 34 | + return | |
| 34 | 35 | |
| 35 | - sw_size = int(name.split("_")[2][2:]) | |
| 36 | + sw_size = int(name.split("_")[2][2:]) | |
| 36 | 37 | |
| 37 | - logging.warning(" go {} ".format(name)) | |
| 38 | + logging.warning(" go {} ".format(name)) | |
| 38 | 39 | |
| 39 | 40 | |
| 40 | - logging.warning("Redo Vocab and stop") | |
| 41 | - asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y]) | |
| 42 | - trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y]) | |
| 43 | - asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ] | |
| 44 | - trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ] | |
| 45 | - stop_words=set(asr_sw) | set(trs_sw) | |
| 41 | + logging.warning("Redo Vocab and stop") | |
| 42 | + asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y]) | |
| 43 | + trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y]) | |
| 44 | + asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ] | |
| 45 | + trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ] | |
| 46 | + stop_words=set(asr_sw) | set(trs_sw) | |
| 46 | 47 | |
| 47 | - logging.warning("TRS to be done") | |
| 48 | - | |
| 49 | - dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]] | |
| 50 | - lda_trs = LdaModel.load("{}/lda_trs.model".format(in_dir)) | |
| 51 | - perp_trs = lda_trs.log_perplexity(dev_trs) | |
| 52 | - logging.warning("ASR to be done") | |
| 53 | - dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]] | |
| 54 | - lda_asr = LdaModel.load("{}/lda_asr.model".format(in_dir)) | |
| 55 | - perp_asr = lda_asr.log_perplexity(dev_asr) | |
| 56 | - logging.warning("ASR saving") | |
| 57 | - res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs} | |
| 58 | - return res_dict | |
| 48 | + logging.warning("TRS to be done") | |
| 49 | + | |
| 50 | + dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]] | |
| 51 | + lda_trs = LdaModel.load("{}/lda_trs.model".format(in_dir)) | |
| 52 | + perp_trs = lda_trs.log_perplexity(dev_trs) | |
| 53 | + logging.warning("ASR to be done") | |
| 54 | + dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]] | |
| 55 | + lda_asr = LdaModel.load("{}/lda_asr.model".format(in_dir)) | |
| 56 | + perp_asr = lda_asr.log_perplexity(dev_asr) | |
| 57 | + logging.warning("ASR saving") | |
| 58 | + res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs} | |
| 59 | + return res_dict | |
| 60 | + except Exception : | |
| 61 | + return { "name" : name } | |
| 59 | 62 | |
| 60 | 63 | if __name__ == "__main__": |
| 61 | 64 | input_shelve = sys.argv[1] |
LDA/04a-mmdf.py
| 1 | + | |
| 2 | +# coding: utf-8 | |
| 3 | + | |
| 4 | +# In[29]: | |
| 5 | + | |
| 6 | +# Import | |
| 7 | +import itertools | |
| 8 | +import shelve | |
| 9 | +import pickle | |
| 10 | +import numpy | |
| 11 | +import scipy | |
| 12 | +from scipy import sparse | |
| 13 | +import scipy.sparse | |
| 14 | +import scipy.io | |
| 15 | +from mlp import * | |
| 16 | +import mlp | |
| 17 | +from keras.optimizers import Adam # explicit import for the optimizer used below | |
| 17 | +import sys | |
| 18 | +import utils | |
| 19 | +import dill | |
| 20 | +from collections import Counter | |
| 21 | +from gensim.models import LdaModel | |
| 22 | + | |
| 23 | + | |
| 24 | + | |
| 25 | +# In[3]: | |
| 26 | + | |
| 27 | +#30_50_50_150_0.0001 | |
| 28 | + | |
| 29 | +# In[4]: | |
| 30 | + | |
| 31 | +#db=shelve.open("SPELIKE_MLP_DB.shelve",writeback=True) | |
| 32 | +origin_corps=shelve.open("{}".format(sys.argv[2])) | |
| 33 | +in_dir = sys.argv[1] | |
| 34 | + | |
| 35 | + | |
| 36 | +out_db=shelve.open("{}/mlp_scores.shelve".format(in_dir),writeback=True) | |
| 37 | + | |
| 38 | +mlp_h = [ 250, 250 ] | |
| 39 | +mlp_loss = "categorical_crossentropy" | |
| 40 | +mlp_dropouts = [0.25]* len(mlp_h) | |
| 41 | +mlp_sgd = Adam(lr=0.0001) | |
| 42 | +mlp_epochs = 3000 | |
| 43 | +mlp_batch_size = 1 | |
| 44 | +mlp_input_activation = "relu" | |
| 45 | +mlp_output_activation="softmax" | |
| 46 | + | |
| 47 | +ress = [] | |
| 48 | +for key in ["TRS", "ASR"] : | |
| 49 | + | |
| 50 | + res=mlp.train_mlp(origin_corps["LDA"][key]["TRAIN"],origin_corps["LABEL"][key]["TRAIN"], | |
| 51 | + origin_corps["LDA"][key]["DEV"],origin_corps["LABEL"][key]["DEV"], | |
| 52 | + origin_corps["LDA"][key]["TEST"],origin_corps["LABEL"][key]["TEST"], | |
| 53 | + mlp_h,dropouts=mlp_dropouts,sgd=mlp_sgd, | |
| 54 | + epochs=mlp_epochs, | |
| 55 | + batch_size=mlp_batch_size, | |
| 56 | + save_pred=False,keep_histo=False, | |
| 57 | + loss="categorical_crossentropy",fit_verbose=0) | |
| 58 | + arg_best=[] | |
| 59 | + dev_best=[] | |
| 60 | + # keep the 12 best dev epochs: take the argmax, record it, then zero it out | |
| 61 | + for _ in range(12) : | |
| 62 | + arg_best.append(numpy.argmax(res[1])) | |
| 63 | + dev_best.append(res[1][arg_best[-1]]) | |
| 64 | + res[1][arg_best[-1]]=0 | |
| 96 | + | |
| 97 | + | |
| 98 | + | |
| 99 | + | |
| 100 | + test_best =[ res[2][x] for x in arg_best ] | |
| 101 | + test_max = numpy.max(res[2]) | |
| 102 | + out_db[key]=(res,(dev_best,test_best,test_max)) | |
| 103 | + ress.append((key,dev_best,test_best,test_max)) | |
| 104 | + | |
| 105 | +for el in ress : | |
| 106 | + print el | |
| 107 | +out_db.close() | |
| 108 | +origin_corps.close() |
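The argmax-and-zero bookkeeping above collects the 12 best dev epochs by destructively zeroing each maximum. A non-destructive equivalent with numpy.argsort (same output for distinct scores; ties may order differently), shown here as an illustrative helper:

```python
import numpy

def top_k(scores, k=12):
    # indices of the k largest scores, best first, without mutating the input
    scores = numpy.asarray(scores, dtype=float)
    order = numpy.argsort(scores)[::-1][:k]
    return list(order), scores[order].tolist()

arg_best, dev_best = top_k([0.1, 0.7, 0.3, 0.9], k=2)
print(arg_best, dev_best)  # [3, 1] [0.9, 0.7]
```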
LDA/04b-mmf_mini_ae.py
| 1 | + | |
| 2 | +# coding: utf-8 | |
| 3 | + | |
| 4 | +# In[2]: | |
| 5 | + | |
| 6 | +# Import | |
| 7 | +import gensim | |
| 8 | +from scipy import sparse | |
| 9 | +import itertools | |
| 10 | +from sklearn import preprocessing | |
| 11 | +from keras.models import Sequential | |
| 12 | +from keras.optimizers import SGD,Adam | |
| 13 | +from mlp import * | |
| 14 | +import sklearn.metrics | |
| 15 | +import shelve | |
| 16 | +import pickle | |
| 17 | +from utils import * | |
| 18 | +import sys | |
| 19 | +import os | |
| 20 | +import json | |
| 21 | +# In[4]: | |
| 22 | + | |
| 23 | +infer_model=shelve.open("{}".format(sys.argv[2])) | |
| 24 | +in_dir = sys.argv[1] | |
| 25 | +#['ASR', 'TRS', 'LABEL'] | |
| 26 | +# In[6]: | |
| 27 | + | |
| 28 | + | |
| 29 | +hidden_size=[ 100 , 50, 100 ] | |
| 30 | +input_activation="tanh" | |
| 31 | +output_activation="tanh" | |
| 32 | +loss="mse" | |
| 33 | +epochs=1000 | |
| 34 | +batch=1 | |
| 35 | +patience=60 | |
| 36 | +do_do=[False] | |
| 37 | +sgd = Adam(lr=0.000001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) | |
| 38 | + | |
| 39 | + | |
| 40 | + | |
| 41 | +mlp_h = [ 150 ,150 ,150 ] | |
| 42 | +mlp_loss = "categorical_crossentropy" | |
| 43 | +mlp_dropouts = [] | |
| 44 | +mlp_sgd = Adam(lr=0.0001) | |
| 45 | +mlp_epochs = 2000 | |
| 46 | +mlp_batch_size = 8 | |
| 47 | +mlp_output_activation="softmax" | |
| 48 | + | |
| 49 | +try : | |
| 50 | + sgd_repr=sgd.get_config()["name"] | |
| 51 | +except AttributeError : | |
| 52 | + sgd_repr=sgd | |
| 53 | + | |
| 54 | +try : | |
| 55 | + mlp_sgd_repr=mlp_sgd.get_config()["name"] | |
| 56 | +except AttributeError : | |
| 57 | + mlp_sgd_repr=mlp_sgd | |
| 58 | + | |
| 59 | + | |
| 60 | +params={ "h1" : "_".join([ str(x) for x in hidden_size ]), | |
| 61 | + "inside_activation" : input_activation, | |
| 62 | + "output_activation" : output_activation, | |
| 63 | + "do_dropout": "_".join([str(x) for x in do_do]), | |
| 64 | + "loss" : loss, | |
| 65 | + "epochs" : epochs , | |
| 66 | + "batch_size" : batch, | |
| 67 | + "patience" : patience, | |
| 68 | + "sgd" : sgd_repr, | |
| 69 | + "mlp_h ": "_".join([str(x) for x in mlp_h]), | |
| 70 | + "mlp_loss ": mlp_loss, | |
| 71 | + "mlp_dropouts ": "_".join([str(x) for x in mlp_dropouts]), | |
| 72 | + "mlp_sgd ": mlp_sgd_repr, | |
| 73 | + "mlp_epochs ": mlp_epochs, | |
| 74 | + "mlp_batch_size ": mlp_batch_size, | |
| 75 | + "mlp_output" : mlp_output_activation | |
| 76 | + } | |
| 77 | +name = "_".join([ str(x) for x in params.values()]) | |
| 78 | +try: | |
| 79 | + os.mkdir("{}/{}".format(in_dir,name)) | |
| 80 | +except: | |
| 81 | + pass | |
| 82 | +db = shelve.open("{}/{}/ae_model.shelve".format(in_dir,name),writeback=True) | |
| 83 | +db["params"] = params | |
| 84 | +db["LABEL"]=infer_model["LABEL"] | |
| 85 | +# | |
| 86 | +json.dump(params, | |
| 87 | + open("{}/{}/ae_model.json".format(in_dir,name),"w"), | |
| 88 | + indent=4) | |
| 89 | + | |
| 90 | +keys = ["ASR","TRS"] | |
| 91 | + | |
| 92 | +db["AE"] = {} | |
| 93 | +db["LDA"] = {} | |
| 94 | +for mod in keys : | |
| 95 | + print mod | |
| 96 | + db["LDA"][mod] = train_mlp(infer_model["LDA"][mod]["TRAIN"],infer_model["LABEL"][mod]["TRAIN"], | |
| 97 | + infer_model["LDA"][mod]["DEV"],infer_model["LABEL"][mod]["DEV"], | |
| 98 | + infer_model["LDA"][mod]["TEST"],infer_model["LABEL"][mod]["TEST"], | |
| 99 | + mlp_h ,sgd=mlp_sgd, | |
| 100 | + epochs=mlp_epochs, | |
| 101 | + batch_size=mlp_batch_size, | |
| 102 | + input_activation=input_activation, | |
| 103 | + output_activation=mlp_output_activation, | |
| 104 | + dropouts=mlp_dropouts, | |
| 105 | + fit_verbose=0) | |
| 106 | + | |
| 107 | + res=train_ae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"],infer_model["LDA"][mod]["TEST"], | |
| 108 | + hidden_size,patience = params["patience"],sgd=sgd, | |
| 109 | + dropouts=do_do,input_activation=input_activation,output_activation=output_activation, | |
| 110 | + loss=loss,epochs=epochs,batch_size=batch,verbose=0) | |
| 111 | + mlp_res_list=[] | |
| 112 | + for layer in res : | |
| 113 | + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], | |
| 114 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
| 115 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
| 116 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs, | |
| 117 | + output_activation=mlp_output_activation, | |
| 118 | + input_activation=input_activation, | |
| 119 | + batch_size=mlp_batch_size,fit_verbose=0)) | |
| 120 | + db["AE"][mod]=mlp_res_list | |
| 121 | + | |
| 122 | +mod = "ASR" | |
| 123 | +mod2= "TRS" | |
| 124 | +mlp_res_list=[] | |
| 125 | + | |
| 126 | +res = train_ae(infer_model["LDA"][mod]["TRAIN"], | |
| 127 | + infer_model["LDA"][mod]["DEV"], | |
| 128 | + infer_model["LDA"][mod]["TEST"], | |
| 129 | + hidden_size,dropouts=do_do,patience = params["patience"], | |
| 130 | + sgd=sgd,input_activation=input_activation,output_activation=output_activation,loss=loss,epochs=epochs, | |
| 131 | + batch_size=batch, | |
| 132 | + y_train=infer_model["LDA"][mod2]["TRAIN"], # target modality, matching y_dev/y_test | |
| 133 | + y_dev=infer_model["LDA"][mod2]["DEV"], | |
| 134 | + y_test=infer_model["LDA"][mod2]["TEST"]) | |
| 135 | + | |
| 136 | +for layer in res : | |
| 137 | + mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], | |
| 138 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
| 139 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
| 140 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs, | |
| 141 | + output_activation=mlp_output_activation, | |
| 142 | + input_activation=input_activation, | |
| 143 | + batch_size=mlp_batch_size,fit_verbose=0)) | |
| 144 | + | |
| 145 | +db["AE"]["SPE"] = mlp_res_list | |
| 146 | + | |
| 147 | +db.sync() | |
| 148 | +db.close() |
LDA/04c-mmf_sae.py
| 1 | + | |
| 2 | +# coding: utf-8 | |
| 3 | + | |
| 4 | +# In[2]: | |
| 5 | + | |
| 6 | +# Import | |
| 7 | +import gensim | |
| 8 | +from scipy import sparse | |
| 9 | +import itertools | |
| 10 | +from sklearn import preprocessing | |
| 11 | +from keras.models import Sequential | |
| 12 | +from keras.optimizers import SGD,Adam | |
| 13 | +from mlp import * | |
| 14 | +import mlp | |
| 15 | +import sklearn.metrics | |
| 16 | +import shelve | |
| 17 | +import pickle | |
| 18 | +from utils import * | |
| 19 | +import sys | |
| 20 | +import os | |
| 21 | +import json | |
| 22 | +# In[4]: | |
| 23 | + | |
| 24 | +infer_model=shelve.open("{}".format(sys.argv[2])) | |
| 25 | +in_dir = sys.argv[1] | |
| 26 | +#['ASR', 'TRS', 'LABEL'] | |
| 27 | +# In[6]: | |
| 28 | + | |
| 29 | + | |
| 30 | +hidden_size=[ 100, 80, 50 , 20 ] | |
| 31 | +input_activation="relu" | |
| 32 | +output_activation="relu" | |
| 33 | +loss="mse" | |
| 34 | +epochs=3000 | |
| 35 | +batch=1 | |
| 36 | +patience=20 | |
| 37 | +do_do=[ 0 ] * len(hidden_size) | |
| 38 | +sgd = Adam(lr=0.0001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) | |
| 39 | +try : | |
| 40 | + sgd_repr=sgd.get_config()["name"] | |
| 41 | +except AttributeError : | |
| 42 | + sgd_repr=sgd | |
| 43 | + | |
| 44 | +params={ "h1" : "_".join([str(x) for x in hidden_size]), | |
| 45 | + "inside_activation" : input_activation, | |
| 46 | + "out_activation" : output_activation, | |
| 47 | + "do_dropout": "_".join([str(x) for x in do_do]), | |
| 48 | + "loss" : loss, | |
| 49 | + "epochs" : epochs , | |
| 50 | + "batch_size" : batch, | |
| 51 | + "patience" : patience, | |
| 52 | + "sgd" : sgd_repr} | |
| 53 | +name = "_".join([ str(x) for x in params.values()]) | |
| 54 | +try: | |
| 55 | + os.mkdir("{}/SAE_{}".format(in_dir,name)) | |
| 56 | +except: | |
| 57 | + pass | |
| 58 | +db = shelve.open("{}/SAE_{}/ae_model.shelve".format(in_dir,name),writeback=True) | |
| 59 | +# | |
| 60 | +json.dump(params, | |
| 61 | + open("{}/SAE_{}/ae_model.json".format(in_dir,name),"w"), | |
| 62 | + indent=4) | |
| 63 | + | |
| 64 | +keys = ["ASR","TRS"] | |
| 65 | + | |
| 66 | +mlp_h = [ 150 , 300 ] | |
| 67 | +mlp_loss ="categorical_crossentropy" | |
| 68 | +mlp_dropouts = [0,0,0,0] | |
| 69 | +mlp_sgd = Adam(0.001) | |
| 70 | +mlp_epochs = 2000 | |
| 71 | +mlp_batch_size = 8 | |
| 72 | + | |
| 73 | +db["SAE"] = {} | |
| 74 | + | |
| 75 | +db["SAEFT"] = {} | |
| 76 | +for mod in keys : | |
| 77 | + print "MODE ", mod | |
| 78 | + res_tuple=train_sae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"], | |
| 79 | + infer_model["LDA"][mod]["TEST"], | |
| 80 | + hidden_size,dropouts=do_do, | |
| 81 | + patience = params["patience"],sgd=sgd,input_activation="tanh", | |
| 82 | + output_activation="tanh",loss=loss,epochs=epochs, | |
| 83 | + batch_size=batch,verbose=0) | |
| 84 | + #print len(res), [len(x) for x in res[0]], [ len(x) for x in res[1]] | |
| 85 | + for name , levels in zip(["SAE","SAEFT"],res_tuple): | |
| 86 | + print "NAME", name | |
| 87 | + mlp_res_by_level = [] | |
| 88 | + for res in levels: | |
| 89 | + mlp_res_list=[] | |
| 90 | + for nb,layer in enumerate(res) : | |
| 91 | + print "layer NB",nb | |
| 92 | + mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], | |
| 93 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
| 94 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
| 95 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, | |
| 96 | + sgd=mlp_sgd,epochs=mlp_epochs,batch_size=mlp_batch_size, | |
| 97 | + fit_verbose=0)) | |
| 98 | + mlp_res_by_level.append(mlp_res_list) | |
| 99 | + db[name][mod]=mlp_res_by_level | |
| 100 | + | |
| 101 | +mod = "ASR" | |
| 102 | +mod2= "TRS" | |
| 103 | +print "mode SPE " | |
| 104 | +res_tuple = train_sae(infer_model["LDA"][mod]["TRAIN"], | |
| 105 | + infer_model["LDA"][mod]["DEV"], | |
| 106 | + infer_model["LDA"][mod]["TEST"], | |
| 107 | + hidden_size,dropouts=[0],patience=params["patience"], | |
| 108 | + sgd=sgd,input_activation=input_activation,output_activation=input_activation, | |
| 109 | + loss=loss,epochs=epochs,batch_size=batch, | |
| 110 | + y_train=infer_model["LDA"][mod2]["TRAIN"], | |
| 111 | + y_dev=infer_model["LDA"][mod2]["DEV"], | |
| 112 | + y_test=infer_model["LDA"][mod2]["TEST"]) | |
| 113 | + | |
| 114 | +for name , levels in zip(["SAE","SAEFT"],res_tuple): | |
| 115 | + mlp_res_by_level = [] | |
| 116 | + for res in levels : | |
| 117 | + mlp_res_list=[] | |
| 118 | + for layer in res : | |
| 119 | + mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], | |
| 120 | + layer[1],infer_model["LABEL"][mod]["DEV"],layer[2], | |
| 121 | + infer_model["LABEL"][mod]["TEST"], | |
| 122 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, | |
| 123 | + sgd=mlp_sgd,epochs=mlp_epochs,batch_size=mlp_batch_size, | |
| 124 | + fit_verbose=0)) | |
| 125 | + mlp_res_by_level.append(mlp_res_list) | |
| 126 | + db[name]["SPE"] = mlp_res_by_level | |
| 127 | + | |
| 128 | +db.close() |
LDA/04d-mmf_dsae.py
| 1 | + | |
| 2 | +# coding: utf-8 | |
| 3 | + | |
| 4 | +# In[2]: | |
| 5 | + | |
| 6 | +# Import | |
| 7 | +import gensim | |
| 8 | +from scipy import sparse | |
| 9 | +import itertools | |
| 10 | +from sklearn import preprocessing | |
| 11 | +from keras.models import Sequential | |
| 12 | +from keras.optimizers import SGD,Adam | |
| 13 | +from mlp import * | |
| 14 | +import mlp | |
| 15 | +import sklearn.metrics | |
| 16 | +import shelve | |
| 17 | +import pickle | |
| 18 | +from utils import * | |
| 19 | +import sys | |
| 20 | +import os | |
| 21 | +import json | |
| 22 | +# In[4]: | |
| 23 | + | |
| 24 | +infer_model=shelve.open("{}".format(sys.argv[2])) | |
| 25 | +in_dir = sys.argv[1] | |
| 26 | +#['ASR', 'TRS', 'LABEL'] | |
| 27 | +# In[6]: | |
| 28 | + | |
| 29 | +# AE params | |
| 30 | +hidden_size=[ 100, 100 ] | |
| 31 | +input_activation="relu" | |
| 32 | +output_activation="relu" | |
| 33 | +loss="mse" | |
| 34 | +epochs= 1000 | |
| 35 | +batch_size=1 | |
| 36 | +patience=20 | |
| 37 | +do_do=[ 0.25 ] * len(hidden_size) | |
| 38 | +sgd = Adam(lr=0.00001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) | |
| 39 | +try : | |
| 40 | + sgd_repr=sgd.get_config()["name"] | |
| 41 | +except AttributeError : | |
| 42 | + sgd_repr=sgd | |
| 43 | + | |
| 44 | +# Transforme : | |
| 45 | +trans_hidden_size=[ 300 , 300 ] | |
| 46 | +trans_input_activation="relu" | |
| 47 | +trans_output_activation="relu" | |
| 48 | +trans_loss="mse" | |
| 49 | +trans_epochs=1000 | |
| 50 | +trans_batch_size=8 | |
| 51 | +trans_patience=20 | |
| 52 | +trans_do=[ 0.25 ] * len(trans_hidden_size) | |
| 53 | +trans_sgd = Adam(lr=0.0001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) | |
| 54 | +try : | |
| 55 | + trans_sgd_repr=trans_sgd.get_config()["name"] | |
| 56 | +except AttributeError : | |
| 57 | + trans_sgd_repr=trans_sgd | |
| 58 | + | |
| 59 | + | |
| 60 | + | |
| 61 | +ae={ "h1" : "_".join([str(x) for x in hidden_size]), | |
| 62 | + "inside_activation" : input_activation, | |
| 63 | + "out_activation" : output_activation, | |
| 64 | + "do_dropout": "_".join([str(x) for x in do_do]), | |
| 65 | + "loss" : loss, | |
| 66 | + "epochs" : epochs , | |
| 67 | + "batch_size" : batch_size, | |
| 68 | + "patience" : patience, | |
| 69 | + "sgd" : sgd_repr} | |
| 70 | +name = "_".join([ str(x) for x in ae.values()]) | |
| 71 | + | |
| 72 | +trans={ "h1" : "_".join([str(x) for x in trans_hidden_size]), | |
| 73 | + "inside_activation" : trans_input_activation, | |
| 74 | + "out_activation" : trans_output_activation, | |
| 75 | + "do_dropout": "_".join([str(x) for x in trans_do]), | |
| 76 | + "loss" : trans_loss, | |
| 77 | + "epochs" : trans_epochs , | |
| 78 | + "batch_size" : trans_batch_size, | |
| 79 | + "patience" : trans_patience, | |
| 80 | + "sgd" : trans_sgd_repr} | |
| 81 | + | |
| 82 | +mlp_h = [ 300 , 300 ] | |
| 83 | +mlp_loss ="categorical_crossentropy" | |
| 84 | +mlp_dropouts = [0,0,0,0] | |
| 85 | +mlp_sgd = Adam(0.0001) | |
| 86 | +mlp_epochs = 1000 | |
| 87 | +mlp_batch_size = 8 | |
| 88 | +mlp_input_activation = "relu" | |
| 89 | +mlp_output_activation = "softmax" | |
| 90 | + | |
| 91 | +try : | |
| 92 | + mlp_sgd_repr=mlp_sgd.get_config()["name"] | |
| 93 | +except AttributeError : | |
| 94 | + mlp_sgd_repr=mlp_sgd | |
| 95 | + | |
| 96 | + | |
| 97 | + | |
| 98 | +mlp={ "h1" : "_".join([str(x) for x in mlp_h ]), | |
| 99 | + "inside_activation" : mlp_input_activation, | |
| 100 | + "out_activation" : mlp_output_activation, | |
| 101 | + "do_dropout": "_".join([str(x) for x in mlp_dropouts]), | |
| 102 | + "loss" : mlp_loss, | |
| 103 | + "epochs" : mlp_epochs , | |
| 104 | + "batch_size" : mlp_batch_size, | |
| 105 | + "sgd" : mlp_sgd_repr} | |
| 106 | + | |
| 107 | +params = { "ae":ae, "trans":trans, "mlp":mlp} | |
| 108 | +try: | |
| 109 | + os.mkdir("{}/DSAE_{}".format(in_dir,name)) | |
| 110 | +except: | |
| 111 | + pass | |
| 112 | +db = shelve.open("{}/DSAE_{}/ae_model.shelve".format(in_dir,name),writeback=True) | |
| 113 | +# | |
| 114 | +json.dump(params, | |
| 115 | + open("{}/DSAE_{}/ae_model.json".format(in_dir,name),"w"), | |
| 116 | + indent=4) | |
| 117 | + | |
| 118 | +keys = ["ASR","TRS"] | |
| 119 | + | |
| 120 | + | |
| 121 | + | |
| 122 | +db["DSAE"] = {} | |
| 123 | + | |
| 124 | +db["DSAEFT"] = {} | |
| 125 | +mod = "ASR" | |
| 126 | +res_tuple_ASR = train_ae(infer_model["LDA"][mod]["TRAIN"], | |
| 127 | + infer_model["LDA"][mod]["DEV"], | |
| 128 | + infer_model["LDA"][mod]["TEST"], | |
| 129 | + hidden_size,dropouts=do_do, | |
| 130 | + patience = patience,sgd=sgd, | |
| 131 | + input_activation=input_activation, | |
| 132 | + output_activation=output_activation,loss=loss,epochs=epochs, | |
| 133 | + batch_size=batch_size,verbose=0,get_weights=True) | |
| 134 | +mlp_res_list = [] | |
| 135 | +for layer in res_tuple_ASR[0]: | |
| 136 | + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], | |
| 137 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
| 138 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
| 139 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, | |
| 140 | + sgd=mlp_sgd,epochs=mlp_epochs, | |
| 141 | + output_activation=mlp_output_activation, | |
| 142 | + input_activation=mlp_input_activation, | |
| 143 | + batch_size=mlp_batch_size,fit_verbose=0)) | |
| 144 | + | |
| 145 | +db["DSAE"][mod] = mlp_res_list | |
| 146 | +mod = "TRS" | |
| 147 | +print hidden_size | |
| 148 | +res_tuple_TRS = train_ae(infer_model["LDA"][mod]["TRAIN"], | |
| 149 | + infer_model["LDA"][mod]["DEV"], | |
| 150 | + infer_model["LDA"][mod]["TEST"], | |
| 151 | + hidden_size,dropouts=do_do, | |
| 152 | + sgd=sgd,input_activation=input_activation, | |
| 153 | + output_activation=output_activation,loss=loss,epochs=epochs, | |
| 154 | + batch_size=batch_size,patience=patience, | |
| 155 | + verbose=0,get_weights=True) | |
| 156 | + | |
| 157 | +mlp_res_list = [] | |
| 158 | +for layer in res_tuple_TRS[0]: | |
| 159 | + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], | |
| 160 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
| 161 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
| 162 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, | |
| 163 | + sgd=mlp_sgd,epochs=mlp_epochs, | |
| 164 | + output_activation=mlp_output_activation, | |
| 165 | + input_activation=mlp_input_activation, | |
| 166 | + batch_size=mlp_batch_size,fit_verbose=0)) | |
| 167 | + | |
| 168 | +db["DSAE"][mod] = mlp_res_list | |
| 169 | + | |
| 170 | + | |
| 171 | + | |
| 172 | +transfert = [] | |
| 173 | + | |
| 174 | +print " get weight trans" | |
| 175 | + | |
| 176 | + | |
| 182 | +for asr_pred, trs_pred in zip(res_tuple_ASR[0], res_tuple_TRS[0]): | |
| 183 | + print "ASR", [ x.shape for x in asr_pred] | |
| 184 | + | |
| 185 | + print "TRS", [ x.shape for x in trs_pred] | |
| 186 | + transfert.append( train_ae(asr_pred[0], | |
| 187 | + asr_pred[1], | |
| 188 | + asr_pred[2], | |
| 189 | + trans_hidden_size, | |
| 190 | + dropouts=trans_do, | |
| 191 | + y_train = trs_pred[0], | |
| 192 | + y_dev=trs_pred[1], | |
| 193 | + y_test = trs_pred[2], | |
| 194 | + patience = trans_patience,sgd=trans_sgd, | |
| 195 | + input_activation=trans_input_activation, | |
| 196 | + output_activation=trans_output_activation, | |
| 197 | + loss=trans_loss, | |
| 198 | + epochs=trans_epochs, | |
| 199 | + batch_size=trans_batch_size,verbose=0,get_weights=True) ) | |
| 200 | +mod = "ASR" | |
| 201 | +mlp_res_bylvl = [] | |
| 202 | +print " MLP on transfert " | |
| 203 | +for level, w in transfert : | |
| 204 | + mlp_res_list = [] | |
| 205 | + for layer in level : | |
| 206 | + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], | |
| 207 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
| 208 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
| 209 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, | |
| 210 | + sgd=mlp_sgd,epochs=mlp_epochs, | |
| 211 | + output_activation=mlp_output_activation, | |
| 212 | + input_activation=mlp_input_activation, | |
| 213 | + batch_size=mlp_batch_size,fit_verbose=0)) | |
| 214 | + mlp_res_bylvl.append(mlp_res_list) | |
| 215 | +db["DSAE"]["transfert"] = mlp_res_bylvl | |
| 216 | + | |
| 217 | + | |
| 218 | +print " FT " | |
| 219 | +WA = res_tuple_ASR[1] | |
| 220 | +print "WA", len(WA), [ len(x) for x in WA] | |
| 221 | +WT = res_tuple_TRS[1] | |
| 222 | + | |
| 223 | +print "WT", len(WT), [ len(x) for x in WT] | |
| 224 | +Wtr = [ x[1] for x in transfert] | |
| 225 | + | |
| 226 | +print "Wtr", len(Wtr), [ len(x) for x in Wtr],[ len(x[1]) for x in Wtr] | |
| 227 | + | |
| 228 | +ft_res = ft_dsae(infer_model["LDA"]["ASR"]["TRAIN"], | |
| 229 | + infer_model["LDA"]["ASR"]["DEV"], | |
| 230 | + infer_model["LDA"]["ASR"]["TEST"], | |
| 231 | + y_train=infer_model["LDA"]["TRS"]["TRAIN"], | |
| 232 | + y_dev=infer_model["LDA"]["TRS"]["DEV"], | |
| 233 | + y_test=infer_model["LDA"]["TRS"]["TEST"], | |
| 234 | + ae_hidden = hidden_size, | |
| 235 | + transfer_hidden = trans_hidden_size, | |
| 236 | + start_weights = WA, | |
| 237 | + transfer_weights = Wtr, | |
| 238 | + end_weights = WT, | |
| 239 | + input_activation = input_activation, | |
| 240 | + output_activation = output_activation, | |
| 241 | + ae_dropouts= do_do, | |
| 242 | + transfer_do = trans_do, | |
| 243 | + sgd = sgd, | |
| 244 | + loss = loss , | |
| 245 | + patience = patience, | |
| 246 | + batch_size = batch_size, | |
| 247 | + epochs= epochs) | |
| 248 | +mlps_by_lvls= [] | |
| 249 | +for level in ft_res : | |
| 250 | + mlp_res_list = [] | |
| 251 | + for layer in level : | |
| 252 | + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], | |
| 253 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
| 254 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
| 255 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, | |
| 256 | + sgd=mlp_sgd,epochs=mlp_epochs, | |
| 257 | + output_activation=mlp_output_activation, | |
| 258 | + input_activation=mlp_input_activation, | |
| 259 | + batch_size=mlp_batch_size,fit_verbose=0)) | |
| 260 | + mlps_by_lvls.append(mlp_res_list) | |
| 261 | + | |
| 262 | + | |
| 263 | +db["DSAEFT"]["transfert"] = mlps_by_lvls | |
| 264 | + | |
| 265 | +db.close() |
LDA/04e-mm_vae.py
| 1 | + | |
| 2 | +# coding: utf-8 | |
| 3 | + | |
| 4 | +# In[2]: | |
| 5 | + | |
| 6 | +# Import | |
| 7 | +import gensim | |
| 8 | +from scipy import sparse | |
| 9 | +import itertools | |
| 10 | +from sklearn import preprocessing | |
| 11 | +from keras.models import Sequential | |
| 12 | +from keras.optimizers import SGD,Adam | |
| 13 | +from mlp import * | |
| 14 | +from vae import * | |
| 15 | +import sklearn.metrics | |
| 16 | +import shelve | |
| 17 | +import pickle | |
| 18 | +from utils import * | |
| 19 | +import sys | |
| 20 | +import os | |
| 21 | +import json | |
| 22 | +# In[4]: | |
| 23 | + | |
| 24 | +infer_model=shelve.open("{}".format(sys.argv[2])) | |
| 25 | +in_dir = sys.argv[1] | |
| 26 | +#['ASR', 'TRS', 'LABEL'] | |
| 27 | +# In[6]: | |
| 28 | + | |
| 29 | + | |
| 30 | +hidden_size= [60] | |
| 31 | +input_activation="tanh" | |
| 32 | +output_activation="sigmoid" | |
| 33 | +epochs=300 | |
| 34 | +batch=1 | |
| 35 | +patience=60 | |
| 36 | +sgd = Adam(lr=0.0001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) | |
| 37 | +latent_dim = 30 | |
| 38 | + | |
| 39 | + | |
| 40 | + | |
| 41 | +mlp_h = [ 256 ] | |
| 42 | +mlp_loss = "categorical_crossentropy" | |
| 43 | +mlp_dropouts = [] | |
| 44 | +mlp_sgd = Adam(lr=0.001) | |
| 45 | +mlp_epochs = 1000 | |
| 46 | +mlp_batch_size = 16 | |
| 47 | +mlp_output_activation="softmax" | |
| 48 | + | |
| 49 | +try : | |
| 50 | + sgd_repr=sgd.get_config()["name"] | |
| 51 | +except AttributeError : | |
| 52 | + sgd_repr=sgd | |
| 53 | + | |
| 54 | +try : | |
| 55 | + mlp_sgd_repr=mlp_sgd.get_config()["name"] | |
| 56 | +except AttributeError : | |
| 57 | + mlp_sgd_repr=mlp_sgd | |
| 58 | + | |
| 59 | + | |
| 60 | +params={ "h1" : "_".join([ str(x) for x in hidden_size ]), | |
| 61 | + "inside_activation" : input_activation, | |
| 62 | + "output_activation" : output_activation, | |
| 63 | + "epochs" : epochs , | |
| 64 | + "batch_size" : batch, | |
| 65 | + "patience" : patience, | |
| 66 | + "sgd" : sgd_repr, | |
| 67 | + "mlp_h ": "_".join([str(x) for x in mlp_h]), | |
| 68 | + "mlp_loss ": mlp_loss, | |
| 69 | + "mlp_dropouts ": "_".join([str(x) for x in mlp_dropouts]), | |
| 70 | + "mlp_sgd ": mlp_sgd_repr, | |
| 71 | + "mlp_epochs ": mlp_epochs, | |
| 72 | + "mlp_batch_size ": mlp_batch_size, | |
| 73 | + "mlp_output" : mlp_output_activation | |
| 74 | + } | |
| 75 | +name = "_".join([ str(x) for x in params.values()]) | |
| 76 | +try: | |
| 77 | + os.mkdir("{}/VAE_{}".format(in_dir,name)) | |
| 78 | +except: | |
| 79 | + pass | |
| 80 | +db = shelve.open("{}/VAE_{}/ae_model.shelve".format(in_dir,name),writeback=True) | |
| 81 | +db["params"] = params | |
| 82 | +db["LABEL"]=infer_model["LABEL"] | |
| 83 | +# | |
| 84 | +json.dump(params, | |
| 85 | + open("{}/VAE_{}/ae_model.json".format(in_dir,name),"w"), | |
| 86 | + indent=4) | |
| 87 | + | |
| 88 | +keys = ["ASR","TRS"] | |
| 89 | + | |
| 90 | +db["VAE"] = {} | |
| 91 | +db["LDA"] = {} | |
| 92 | +for mod in keys : | |
| 93 | + print mod | |
| 94 | + db["LDA"][mod] = train_mlp(infer_model["LDA"][mod]["TRAIN"],infer_model["LABEL"][mod]["TRAIN"], | |
| 95 | + infer_model["LDA"][mod]["DEV"],infer_model["LABEL"][mod]["DEV"], | |
| 96 | + infer_model["LDA"][mod]["TEST"],infer_model["LABEL"][mod]["TEST"], | |
| 97 | + mlp_h ,sgd=mlp_sgd, | |
| 98 | + epochs=mlp_epochs, | |
| 99 | + batch_size=mlp_batch_size, | |
| 100 | + input_activation=input_activation, | |
| 101 | + output_activation=mlp_output_activation, | |
| 102 | + dropouts=mlp_dropouts, | |
| 103 | + fit_verbose=0) | |
| 104 | + | |
| 105 | + res=train_vae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"],infer_model["LDA"][mod]["TEST"], | |
| 106 | + hidden_size=hidden_size[0], | |
| 107 | + latent_dim=latent_dim,sgd=sgd, | |
| 108 | + input_activation=input_activation,output_activation=output_activation, | |
| 109 | + nb_epochs=epochs,batch_size=batch) | |
| 110 | + mlp_res_list=[] | |
| 111 | + for layer in res : | |
| 112 | + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], | |
| 113 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
| 114 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
| 115 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs, | |
| 116 | + output_activation=mlp_output_activation, | |
| 117 | + input_activation=input_activation, | |
| 118 | + batch_size=mlp_batch_size,fit_verbose=0)) | |
| 119 | + db["VAE"][mod]=mlp_res_list | |
| 120 | + | |
| 121 | +mod = "ASR" | |
| 122 | +mod2= "TRS" | |
| 123 | +mlp_res_list=[] | |
| 124 | + | |
| 125 | +res = train_vae(infer_model["LDA"][mod]["TRAIN"], | |
| 126 | + infer_model["LDA"][mod]["DEV"], | |
| 127 | + infer_model["LDA"][mod]["TEST"], | |
| 128 | + hidden_size=hidden_size[0], | |
| 129 | + sgd=sgd,input_activation=input_activation,output_activation=output_activation, | |
| 130 | + latent_dim=latent_dim, | |
| 131 | + nb_epochs=epochs, | |
| 132 | + batch_size=batch, | |
| 133 | + y_train=infer_model["LDA"][mod2]["TRAIN"], | |
| 134 | + y_dev=infer_model["LDA"][mod2]["DEV"], | |
| 135 | + y_test=infer_model["LDA"][mod2]["TEST"]) | |
| 136 | + | |
| 137 | +for layer in res : | |
| 138 | + mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], | |
| 139 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
| 140 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
| 141 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs, | |
| 142 | + output_activation=mlp_output_activation, | |
| 143 | + input_activation=input_activation, | |
| 144 | + batch_size=mlp_batch_size,fit_verbose=0)) | |
| 145 | + | |
| 146 | +db["VAE"]["SPE"] = mlp_res_list | |
| 147 | + | |
| 148 | +db.sync() | |
| 149 | +db.close() |
LDA/05-mmf_getscore.py
| 1 | +import numpy as np | |
| 2 | +import shelve | |
| 3 | +import sys | |
| 4 | +import glob | |
| 5 | +from collections import defaultdict | |
| 6 | +from tinydb import TinyDB, Query | |
| 7 | +from mako.template import Template | |
| 8 | +import time | |
| 9 | + | |
| 10 | +def get_best(x): | |
| 11 | + argbest=np.argmax(x[1]) | |
| 12 | + maxdev=x[1][argbest] | |
| 13 | + maxtrain=np.max(x[0]) | |
| 14 | + maxtest=np.max(x[2]) | |
| 15 | + besttest=x[2][argbest] | |
| 16 | + return ( maxtrain,maxdev,maxtest,besttest) | |
| 17 | +depth = lambda L: isinstance(L, list) and max(map(depth, L))+1 | |
| 18 | + | |
| 19 | + | |
| 20 | +template_name = ''' | |
| 21 | +${name} | |
| 22 | +======================== | |
| 23 | + | |
| 24 | +MLP scores : | |
| 25 | +------------------- | |
| 26 | +''' | |
| 27 | +template_value='''\n\n | |
| 28 | +| ${model} ${ttype} | train | dev |max test| best test| | |
| 29 | +| -------------------:|:--------:|:---------:|:------:|:--------:| | |
| 30 | +% for cpt,line in enumerate(models[model][ttype]): | |
| 31 | +| ${cpt} | ${line[0]} | ${line[1]} |${line[2]} | ${line[3]} | | |
| 32 | +% endfor | |
| 33 | +\n | |
| 34 | +''' | |
| 35 | + | |
| 36 | +# ae_model.shelve | |
| 37 | +def get_folder_file(x): | |
| 38 | + folder=x.split("/")[1] | |
| 39 | + shelve_file = ".".join(x.split(".")[:-1]) | |
| 40 | + return(folder,shelve_file) | |
| 41 | + | |
| 42 | +in_folder = sys.argv[1] | |
| 43 | + | |
| 44 | + | |
| 45 | +models = defaultdict(dict) | |
| 46 | + | |
| 47 | +ae_model_list = glob.glob("{}/*/ae_model.shelve.dir".format(in_folder)) | |
| 48 | +ae_model_list = sorted(ae_model_list) | |
| 49 | +ae_model_list= map(get_folder_file,ae_model_list) | |
| 50 | +for name , shelve_file in ae_model_list : | |
| 51 | + print Template(template_name).render(name=name) | |
| 52 | + opened_shelve = shelve.open(shelve_file) | |
| 53 | + keys = opened_shelve.keys() | |
| 54 | + if "LABEL" in keys : | |
| 55 | + keys.remove("LABEL") | |
| 56 | + if "params" in keys: | |
| 57 | + keys.remove("params") | |
| 58 | + to_print = [] | |
| 59 | + for working_key in keys: | |
| 60 | + for key in opened_shelve[working_key].keys(): | |
| 61 | + table_depth = depth(opened_shelve[working_key][key]) | |
| 62 | + if table_depth == 3 : | |
| 63 | + models[working_key][key] = [ get_best(x) for x in opened_shelve[working_key][key] ] | |
| 64 | + to_print.append(Template(template_value).render(model=working_key,ttype=key,models=models).strip()) | |
| 65 | + elif table_depth == 2 : | |
| 66 | + models[working_key][key] = [ get_best(opened_shelve[working_key][key]) ] | |
| 67 | + to_print.append(Template(template_value).render(model=working_key,ttype=key,models=models).strip()) | |
| 68 | + elif table_depth == 4 : | |
| 69 | + for layer in opened_shelve[working_key][key] : | |
| 70 | + models[working_key][key] = [ get_best(x) for x in layer ] | |
| 71 | + to_print.append(Template(template_value).render(model=working_key,ttype=key,models=models).strip()) | |
| 72 | + print "\n".join(to_print) |
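As a worked example of get_best above (illustrative values; runnable if pasted at the end of the script, which already imports numpy as np): the dev argmax picks the epoch, and the test score at that epoch is reported next to the raw maxima.

```python
scores = ([0.5, 0.8, 0.9],   # train accuracy per epoch
          [0.4, 0.7, 0.6],   # dev accuracy per epoch
          [0.3, 0.65, 0.6])  # test accuracy per epoch
print(get_best(scores))      # (0.9, 0.7, 0.65, 0.65): argmax(dev) = epoch 1
```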
LDA/run2.sh
| 1 | 1 | #python 00-prepross.py |
| 2 | -python 02-lda-order.py DECODA_list_wid.shelve output_v5/perplex.db 50 10 output_v5 50_500 0 1_0.1 1_0.1 500_1000 100_2000 | |
| 2 | +python 02-lda.py DECODA_list_wid.shelve output_v5/perplex.db 50 10 output_v5 50_500 0 1_0.1 1_0.1 500_1000 100_2000 | |
| 3 | 3 | #python 03-perplex.py DECODA_list_wid.shelve output_v5 output_v5/perplex.db |
| 4 | 4 | python 03-order_by_perp.py output_v5/perplex.db output_v5 |
| 5 | 5 | bash 04-run_mlp_ae.sh output_v5 DECODA_list_wid.shelve |
LDA/utils.py
LDA/vae.py
| 1 | +'''This script demonstrates how to build a variational autoencoder with Keras. | |
| 2 | +Reference: "Auto-Encoding Variational Bayes" https://arxiv.org/abs/1312.6114 | |
| 3 | +''' | |
| 4 | + | |
| 5 | +import itertools | |
| 6 | +import sys | |
| 7 | +import json | |
| 8 | + | |
| 9 | +import numpy as np | |
| 10 | +import matplotlib.pyplot as plt | |
| 11 | +from scipy import sparse | |
| 12 | +import scipy.io | |
| 13 | + | |
| 14 | +from keras.layers import Input, Dense, Lambda | |
| 15 | +from keras.models import Model | |
| 16 | +from keras import backend as K | |
| 17 | +from keras import objectives | |
| 18 | +from keras.datasets import mnist | |
| 19 | + | |
| 20 | +import pandas | |
| 21 | +import shelve | |
| 22 | +import pickle | |
| 23 | + | |
| 24 | + | |
| 25 | + | |
| 26 | + | |
| 27 | + | |
| 28 | +#batch_size = 16 | |
| 29 | +#original_dim = 784 | |
| 30 | +#latent_dim = 2 | |
| 31 | +#intermediate_dim = 128 | |
| 32 | +#epsilon_std = 0.01 | |
| 33 | +#nb_epoch = 40 | |
| 34 | + | |
| 35 | + | |
| 36 | + | |
| 37 | + | |
| 38 | +def train_vae(x_train,x_dev,x_test,y_train=None,y_dev=None,y_test=None,hidden_size=80,latent_dim=12,batch_size=8,nb_epochs=10,sgd="rmsprop",input_activation = "relu",output_activation = "sigmoid",epsilon_std=0.01): | |
| 39 | + | |
| 40 | + | |
| 41 | + | |
| 42 | + def sampling(args): | |
| 43 | + z_mean, z_log_std = args | |
| 44 | + epsilon = K.random_normal(shape=(batch_size, latent_dim), | |
| 45 | + mean=0., std=epsilon_std) | |
| 46 | + return z_mean + K.exp(z_log_std) * epsilon | |
| 47 | + | |
| 48 | + def vae_loss(x, x_decoded_mean): | |
| 49 | + xent_loss = objectives.binary_crossentropy(x, x_decoded_mean) | |
| 50 | + kl_loss = - 0.5 * K.mean(1 + z_log_std - K.square(z_mean) - K.exp(z_log_std), axis=-1) | |
| 51 | + return xent_loss + kl_loss | |
| 52 | + | |
| 53 | + original_dim = x_train.shape[1] | |
| 54 | + | |
| 55 | + | |
| 56 | + x = Input(batch_shape=(batch_size, original_dim)) | |
| 57 | + h = Dense(hidden_size, activation=input_activation)(x) | |
| 58 | + z_mean = Dense(latent_dim)(h) | |
| 59 | + z_log_std = Dense(latent_dim)(h) | |
| 60 | + | |
| 61 | + | |
| 62 | + # note that "output_shape" isn't necessary with the TensorFlow backend | |
| 63 | + # so you could write `Lambda(sampling)([z_mean, z_log_std])` | |
| 64 | + z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_std]) | |
| 65 | + | |
| 66 | + # we instantiate these layers separately so as to reuse them later | |
| 67 | + decoder_h = Dense(hidden_size, activation=input_activation) | |
| 68 | + decoder_mean = Dense(original_dim, activation=output_activation) | |
| 69 | + h_decoded = decoder_h(z) | |
| 70 | + x_decoded_mean = decoder_mean(h_decoded) | |
| 71 | + | |
| 72 | + | |
| 73 | + vae = Model(x, x_decoded_mean) | |
| 74 | + vae.compile(optimizer=sgd, loss=vae_loss) | |
| 75 | + | |
| 76 | + # train the VAE on MNIST digits | |
| 77 | + if y_train is None or y_dev is None or y_test is None : | |
| 78 | + y_train = x_train | |
| 79 | + y_dev = x_dev | |
| 80 | + y_test = x_test | |
| 81 | + | |
| 82 | + vae.fit(x_train, y_train, | |
| 83 | + shuffle=True, | |
| 84 | + nb_epoch=nb_epochs, | |
| 85 | + batch_size=batch_size, | |
| 86 | + validation_data=(x_dev, y_dev)) | |
| 87 | + | |
| 88 | + # build a model to project inputs on the latent space | |
| 89 | + encoder = Model(x, z_mean) | |
| 90 | + pred_train = encoder.predict(x_train, batch_size=batch_size) | |
| 91 | + pred_dev = encoder.predict(x_dev, batch_size=batch_size) | |
| 92 | + pred_test = encoder.predict(x_test,batch_size=batch_size) | |
| 93 | + return [ [ pred_train, pred_dev, pred_test ] ] | |
| 94 | +# display a 2D plot of the digit classes in the latent space | |
| 95 | + #x_test_encoded = encoder.predict(x_test, batch_size=batch_size) | |
| 96 | + # build a digit generator that can sample from the learned distribution | |
| 97 | + #decoder_input = Input(shape=(latent_dim,)) | |
| 98 | + #_h_decoded = decoder_h(decoder_input) | |
| 99 | + #_x_decoded_mean = decoder_mean(_h_decoded) | |
| 100 | + #generator = Model(decoder_input, _x_decoded_mean) | |
| 101 | + #x_decoded = generator.predict(z_sample) | |
| 102 | + |
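A minimal usage sketch for train_vae, assuming a Keras 1.x environment (matching the nb_epoch/objectives API above). Random arrays stand in for the LDA features; set sizes must be multiples of batch_size because the encoder is built with a fixed batch_shape:

```python
import numpy as np
from vae import train_vae

x_train = np.random.uniform(0, 1, (96, 40)).astype("float32")
x_dev   = np.random.uniform(0, 1, (32, 40)).astype("float32")
x_test  = np.random.uniform(0, 1, (32, 40)).astype("float32")

layers = train_vae(x_train, x_dev, x_test,
                   hidden_size=60, latent_dim=30,
                   batch_size=8, nb_epochs=2)
pred_train, pred_dev, pred_test = layers[0]
print(pred_train.shape)  # (96, 30): latent means, used as features downstream
```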