diff --git a/LDA/00-mmf_make_features.py b/LDA/00-mmf_make_features.py new file mode 100644 index 0000000..cc80b49 --- /dev/null +++ b/LDA/00-mmf_make_features.py @@ -0,0 +1,38 @@ +import sys +import os + +import pandas +import numpy +import shelve + +from sklearn.preprocessing import LabelBinarizer + +from utils import select_mmf as select + +input_dir = sys.argv[1] # Dossier de premire niveau contient ASR et TRS +level = sys.argv[2] # taille de LDA ( -5) voulu + +lb=LabelBinarizer() +#y_train=lb.fit_transform([utils.select(ligneid) for ligneid in origin_corps["LABEL"]["TRAIN"]]) + + +data = shelve.open("{}/mmf_{}.shelve".format(input_dir,level)) +data["LABEL"]= {"LDA":{}} +for mod in ["ASR", "TRS" ] + train = pandas.read_table("{}/{}/train_{}.ssv".format(input_dir, mod, level), sep=" ", header=None ) + dev = pandas.read_table("{}/{}/dev_{}.ssv".format(input_dir, mod, level), sep=" ", header=None ) + test = pandas.read_table("{}/{}/test_{}.ssv".format(input_dir, mod, level), sep=" ", header=None ) + + y_train = train.iloc[:,0].apply(select) + y_dev = dev.iloc[:,0].apply(select) + y_test = test.iloc[:,0].apply(select) + lb.fit(y_train) + data["LABEL"][mod]={"TRAIN":lb.transform(y_train),"DEV":lb.transform(y_dev), "TEST": lb.transform(y_test)} + + data["LDA"][mod]={} + data["LDA"][mod]["TRAIN"]=train.iloc[:,1:].values + data["LDA"][mod]["DEV"]=dev.iloc[:,1:].values + data["LDA"][mod]["TEST"]=test.iloc[:,1:].values + +data.sync() +data.close() diff --git a/LDA/02-lda.py b/LDA/02-lda.py index 482a214..a5e0c83 100644 --- a/LDA/02-lda.py +++ b/LDA/02-lda.py @@ -12,10 +12,11 @@ import logging import dill from tinydb import TinyDB, where, Query import time +from joblib import Parallel, delayed def calc_perp(models,train): - + stop_words=models[1] name = models[0] @@ -45,7 +46,8 @@ def calc_perp(models,train): def train_lda(out_dir,train,size,it,sw_size,alpha,eta,passes,chunk): name = "s{}_it{}_sw{}_a{}_e{}_p{}_c{}".format(size,it,sw_size,alpha,eta,passes,chunk) logging.warning(name) - if os.path.isfile(out_dir+"/"+name+".dill"): + deep_out_dir = out_dir+"/"+name + if os.path.isdir(deep_out_dir): logging.error(name+" already done") return logging.warning(name+" to be done") @@ -54,7 +56,6 @@ def train_lda(out_dir,train,size,it,sw_size,alpha,eta,passes,chunk): asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ] trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ] stop_words=set(asr_sw) | set(trs_sw) - stop_words=[ x.strip() for x in open("french.txt").readlines() ] logging.warning("TRS to be done") @@ -68,19 +69,42 @@ def train_lda(out_dir,train,size,it,sw_size,alpha,eta,passes,chunk): asr_probs = [] for line in lda_asr.expElogbeta: nline = line / np.sum(line) - asr_probs.append( str(x) for x in nline) + asr_probs.append([ str(x) for x in nline]) trs_probs = [] for line in lda_trs.expElogbeta: nline = line / np.sum(line) - trs_probs.append( str(x) for x in nline) + trs_probs.append([str(x) for x in nline]) K = lda_asr.num_topics topicWordProbMat_asr = lda_asr.print_topics(K,10) K = lda_trs.num_topics topicWordProbMat_trs = lda_trs.print_topics(K,10) + os.mkdir(deep_out_dir) + dill.dump([x for x in stop_words],open(deep_out_dir+"/stopwords.dill","w")) + lda_asr.save(deep_out_dir+"/lda_asr.model") + lda_trs.save(deep_out_dir+"/lda_trs.model") + dill.dump([x for x in asr_probs],open(deep_out_dir+"/lda_asr_probs.dill","w")) + dill.dump([x for x in trs_probs],open(deep_out_dir+"/lda_trs_probs.dill","w")) + return [name, stop_words, lda_asr , asr_probs , topicWordProbMat_asr, lda_trs, 
trs_probs, topicWordProbMat_trs] +def train_one(name,train,s,i,sw,a,e,p,c): + st=time.time() + logging.warning(" ; ".join([str(x) for x in [s,i,sw,a,e,p,c]])) + models = train_lda(name,train,s,i,sw,a,e,p,c) + if models: + m = calc_perp(models,train) + #dill.dump(models,open("{}/{}.dill".format(name,models[0]),"wb")) + else : + m = None + e = time.time() + logging.warning("fin en : {}".format(e-st)) + return m + + + + if __name__ == "__main__": logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING) @@ -109,6 +133,8 @@ if __name__ == "__main__": db = TinyDB(db_path) nb_model = len(passes) * len(chunk) * len(it) * len(sw_size) * len(alpha) * len(eta) * len(size) logging.warning(" hey will train {} models ".format(nb_model)) + + args_list=[] for p in passes: for c in chunk: for i in it : @@ -116,12 +142,8 @@ if __name__ == "__main__": for a in alpha: for e in eta: for s in size: - st=time.time() - logging.warning(" ; ".join([str(x) for x in [s,i,sw,a,e,p,c]])) - models = train_lda(name,train,s,i,sw,a,e,p,c) - if models: - m = calc_perp(models,train) - dill.dump(models,open("{}/{}.dill".format(name,models[0]),"wb")) - db.insert(m) - e = time.time() - logging.warning("fin en : {}".format(e-st)) + args_list.append((name,train,s,i,sw,a,e,p,c)) + res_list= Parallel(n_jobs=15)(delayed(train_one)(*args) for args in args_list) + for m in res_list : + db.insert(m) + diff --git a/LDA/03-mono_perplex.py b/LDA/03-mono_perplex.py index 983eb8f..25083d4 100644 --- a/LDA/03-mono_perplex.py +++ b/LDA/03-mono_perplex.py @@ -52,7 +52,7 @@ if __name__ == "__main__": input_dir = sys.argv[2] db_path = sys.argv[3] logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING) - folders = glob.glob("{}/*".format(input_dir)) + folders = glob.glob("{}/s*".format(input_dir)) #train=pickle.load(open("{}/newsgroup_bow_train.pk".format(input_dir))) train = shelve.open(input_shelve) diff --git a/LDA/03-perplex.py b/LDA/03-perplex.py index 21f3c4c..53be707 100644 --- a/LDA/03-perplex.py +++ b/LDA/03-perplex.py @@ -22,40 +22,43 @@ def grouper(n, iterable, fillvalue=None): def calc_perp(params): - in_dir,train = params - name = in_dir.split("/")[-1] - # s40_it1_sw50_a0.01_e0.1_p6_c1000 + try: + in_dir,train = params + name = in_dir.split("/")[-1] + # s40_it1_sw50_a0.01_e0.1_p6_c1000 - entry = Query() - value=db.search(entry.name == name) - if len(value) > 0 : - logging.warning("{} already done".format(name)) - return + entry = Query() + value=db.search(entry.name == name) + if len(value) > 0 : + logging.warning("{} already done".format(name)) + return - sw_size = int(name.split("_")[2][2:]) + sw_size = int(name.split("_")[2][2:]) - logging.warning(" go {} ".format(name)) + logging.warning(" go {} ".format(name)) - logging.warning("Redo Vocab and stop") - asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y]) - trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y]) - asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ] - trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ] - stop_words=set(asr_sw) | set(trs_sw) + logging.warning("Redo Vocab and stop") + asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y]) + trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y]) + asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ] + trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ] + stop_words=set(asr_sw) | set(trs_sw) - logging.warning("TRS to be done") - - dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in 
stop_words] for z in train["TRS_wid"]["DEV"]] - lda_trs = LdaModel.load("{}/lda_trs.model".format(in_dir)) - perp_trs = lda_trs.log_perplexity(dev_trs) - logging.warning("ASR to be done") - dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]] - lda_asr = LdaModel.load("{}/lda_asr.model".format(in_dir)) - perp_asr = lda_asr.log_perplexity(dev_asr) - logging.warning("ASR saving") - res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs} - return res_dict + logging.warning("TRS to be done") + + dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]] + lda_trs = LdaModel.load("{}/lda_trs.model".format(in_dir)) + perp_trs = lda_trs.log_perplexity(dev_trs) + logging.warning("ASR to be done") + dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]] + lda_asr = LdaModel.load("{}/lda_asr.model".format(in_dir)) + perp_asr = lda_asr.log_perplexity(dev_asr) + logging.warning("ASR saving") + res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs} + return res_dict + except : + return { "name" : name } if __name__ == "__main__": input_shelve = sys.argv[1] diff --git a/LDA/04a-mmdf.py b/LDA/04a-mmdf.py new file mode 100644 index 0000000..bb281e4 --- /dev/null +++ b/LDA/04a-mmdf.py @@ -0,0 +1,108 @@ + +# coding: utf-8 + +# In[29]: + +# Import +import itertools +import shelve +import pickle +import numpy +import scipy +from scipy import sparse +import scipy.sparse +import scipy.io +from mlp import * +import mlp +import sys +import utils +import dill +from collections import Counter +from gensim.models import LdaModel + + + +# In[3]: + +#30_50_50_150_0.0001 + +# In[4]: + +#db=shelve.open("SPELIKE_MLP_DB.shelve",writeback=True) +origin_corps=shelve.open("{}".format(sys.argv[2])) +in_dir = sys.argv[1] + + +out_db=shelve.open("{}/mlp_scores.shelve".format(in_dir),writeback=True) + +mlp_h = [ 250, 250 ] +mlp_loss = "categorical_crossentropy" +mlp_dropouts = [0.25]* len(mlp_h) +mlp_sgd = Adam(lr=0.0001) +mlp_epochs = 3000 +mlp_batch_size = 1 +mlp_input_activation = "relu" +mlp_output_activation="softmax" + +ress = [] +for key in ["TRS", "ASR"] : + + res=mlp.train_mlp(origin_corps["LDA"][key]["TRAIN"],origin_corps["LABEL"][key]["TRAIN"], + origin_corps["LDA"][key]["DEV"],origin_corps["LABEL"][key]["DEV"], + origin_corps["LDA"][key]["TEST"],origin_corps["LABEL"][key]["TEST"], + mlp_h,dropouts=mlp_dropouts,sgd=mlp_sgd, + epochs=mlp_epochs, + batch_size=mlp_batch_size, + save_pred=False,keep_histo=False, + loss="categorical_crossentropy",fit_verbose=0) + arg_best=[] + dev_best=[] + arg_best.append(numpy.argmax(res[1])) + dev_best.append(res[1][arg_best[-1]]) + res[1][arg_best[-1]]=0 + arg_best.append(numpy.argmax(res[1])) + dev_best.append(res[1][arg_best[-1]]) + res[1][arg_best[-1]]=0 + arg_best.append(numpy.argmax(res[1])) + dev_best.append(res[1][arg_best[-1]]) + res[1][arg_best[-1]]=0 + arg_best.append(numpy.argmax(res[1])) + dev_best.append(res[1][arg_best[-1]]) + res[1][arg_best[-1]]=0 + arg_best.append(numpy.argmax(res[1])) + dev_best.append(res[1][arg_best[-1]]) + res[1][arg_best[-1]]=0 + arg_best.append(numpy.argmax(res[1])) + dev_best.append(res[1][arg_best[-1]]) + res[1][arg_best[-1]]=0 + arg_best.append(numpy.argmax(res[1])) + dev_best.append(res[1][arg_best[-1]]) + res[1][arg_best[-1]]=0 + arg_best.append(numpy.argmax(res[1])) + dev_best.append(res[1][arg_best[-1]]) + res[1][arg_best[-1]]=0 + arg_best.append(numpy.argmax(res[1])) + 
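+    # Note: the repeated argmax / zero-out statements above and below collect the
+    # indices of the 12 best dev scores in res[1], one at a time. Assuming res[1]
+    # is a 1-D numpy array of positive scores, an equivalent compact form would be:
+    #     arg_best = list(numpy.argsort(res[1])[::-1][:12])
+    #     dev_best = [res[1][i] for i in arg_best]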
dev_best.append(res[1][arg_best[-1]]) + res[1][arg_best[-1]]=0 + arg_best.append(numpy.argmax(res[1])) + dev_best.append(res[1][arg_best[-1]]) + res[1][arg_best[-1]]=0 + arg_best.append(numpy.argmax(res[1])) + dev_best.append(res[1][arg_best[-1]]) + res[1][arg_best[-1]]=0 + arg_best.append(numpy.argmax(res[1])) + dev_best.append(res[1][arg_best[-1]]) + res[1][arg_best[-1]]=0 + + + + + test_best =[ res[2][x] for x in arg_best ] + test_max = numpy.max(res[2]) + out_db[key]=(res,(dev_best,test_best,test_max)) + ress.append((key,dev_best,test_best,test_max)) + +for el in ress : + print el +out_db.close() +origin_corps.close() diff --git a/LDA/04b-mmf_mini_ae.py b/LDA/04b-mmf_mini_ae.py new file mode 100644 index 0000000..d2000f7 --- /dev/null +++ b/LDA/04b-mmf_mini_ae.py @@ -0,0 +1,148 @@ + +# coding: utf-8 + +# In[2]: + +# Import +import gensim +from scipy import sparse +import itertools +from sklearn import preprocessing +from keras.models import Sequential +from keras.optimizers import SGD,Adam +from mlp import * +import sklearn.metrics +import shelve +import pickle +from utils import * +import sys +import os +import json +# In[4]: + +infer_model=shelve.open("{}".format(sys.argv[2])) +in_dir = sys.argv[1] +#['ASR', 'TRS', 'LABEL'] +# In[6]: + + +hidden_size=[ 100 , 50, 100 ] +input_activation="tanh" +output_activation="tanh" +loss="mse" +epochs=1000 +batch=1 +patience=60 +do_do=[False] +sgd = Adam(lr=0.000001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) + + + +mlp_h = [ 150 ,150 ,150 ] +mlp_loss = "categorical_crossentropy" +mlp_dropouts = [] +mlp_sgd = Adam(lr=0.0001) +mlp_epochs = 2000 +mlp_batch_size = 8 +mlp_output_activation="softmax" + +try : + sgd_repr=sgd.get_config()["name"] +except AttributeError : + sgd_repr=sgd + +try : + mlp_sgd_repr=mlp_sgd.get_config()["name"] +except AttributeError : + mlp_sgd_repr=mlp_sgd + + +params={ "h1" : "_".join([ str(x) for x in hidden_size ]), + "inside_activation" : input_activation, + "output_activation" : output_activation, + "do_dropout": "_".join([str(x) for x in do_do]), + "loss" : loss, + "epochs" : epochs , + "batch_size" : batch, + "patience" : patience, + "sgd" : sgd_repr, + "mlp_h ": "_".join([str(x) for x in mlp_h]), + "mlp_loss ": mlp_loss, + "mlp_dropouts ": "_".join([str(x) for x in mlp_dropouts]), + "mlp_sgd ": mlp_sgd_repr, + "mlp_epochs ": mlp_epochs, + "mlp_batch_size ": mlp_batch_size, + "mlp_output" : mlp_output_activation + } +name = "_".join([ str(x) for x in params.values()]) +try: + os.mkdir("{}/{}".format(in_dir,name)) +except: + pass +db = shelve.open("{}/{}/ae_model.shelve".format(in_dir,name),writeback=True) +db["params"] = params +db["LABEL"]=infer_model["LABEL"] +# +json.dump(params, + open("{}/{}/ae_model.json".format(in_dir,name),"w"), + indent=4) + +keys = ["ASR","TRS"] + +db["AE"] = {} +db["LDA"] = {} +for mod in keys : + print mod + db["LDA"][mod] = train_mlp(infer_model["LDA"][mod]["TRAIN"],infer_model["LABEL"][mod]["TRAIN"], + infer_model["LDA"][mod]["DEV"],infer_model["LABEL"][mod]["DEV"], + infer_model["LDA"][mod]["TEST"],infer_model["LABEL"][mod]["TEST"], + mlp_h ,sgd=mlp_sgd, + epochs=mlp_epochs, + batch_size=mlp_batch_size, + input_activation=input_activation, + output_activation=mlp_output_activation, + dropouts=mlp_dropouts, + fit_verbose=0) + + res=train_ae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"],infer_model["LDA"][mod]["TEST"], + hidden_size,patience = params["patience"],sgd=sgd, + 
dropouts=do_do,input_activation=input_activation,output_activation=output_activation, + loss=loss,epochs=epochs,batch_size=batch,verbose=0) + mlp_res_list=[] + for layer in res : + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], + layer[1],infer_model["LABEL"][mod]["DEV"], + layer[2],infer_model["LABEL"][mod]["TEST"], + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs, + output_activation=mlp_output_activation, + input_activation=input_activation, + batch_size=mlp_batch_size,fit_verbose=0)) + db["AE"][mod]=mlp_res_list + +mod = "ASR" +mod2= "TRS" +mlp_res_list=[] + +res = train_ae(infer_model["LDA"][mod]["TRAIN"], + infer_model["LDA"][mod]["DEV"], + infer_model["LDA"][mod]["TEST"], + hidden_size,dropouts=do_do,patience = params["patience"], + sgd=sgd,input_activation=input_activation,output_activation=output_activation,loss=loss,epochs=epochs, + batch_size=batch, + y_train=infer_model["LDA"][mod]["TRAIN"], + y_dev=infer_model["LDA"][mod2]["DEV"], + y_test=infer_model["LDA"][mod2]["TEST"]) + +for layer in res : + mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], + layer[1],infer_model["LABEL"][mod]["DEV"], + layer[2],infer_model["LABEL"][mod]["TEST"], + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs, + output_activation=mlp_output_activation, + input_activation=input_activation, + batch_size=mlp_batch_size,fit_verbose=0)) + +db["AE"]["SPE"] = mlp_res_list + +db.sync() +db.close() diff --git a/LDA/04c-mmf_sae.py b/LDA/04c-mmf_sae.py new file mode 100644 index 0000000..6811118 --- /dev/null +++ b/LDA/04c-mmf_sae.py @@ -0,0 +1,128 @@ + +# coding: utf-8 + +# In[2]: + +# Import +import gensim +from scipy import sparse +import itertools +from sklearn import preprocessing +from keras.models import Sequential +from keras.optimizers import SGD,Adam +from mlp import * +import mlp +import sklearn.metrics +import shelve +import pickle +from utils import * +import sys +import os +import json +# In[4]: + +infer_model=shelve.open("{}".format(sys.argv[2])) +in_dir = sys.argv[1] +#['ASR', 'TRS', 'LABEL'] +# In[6]: + + +hidden_size=[ 100, 80, 50 , 20 ] +input_activation="relu" +output_activation="relu" +loss="mse" +epochs=3000 +batch=1 +patience=20 +do_do=[ 0 ] * len(hidden_size) +sgd = Adam(lr=0.0001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) +try : + sgd_repr=sgd.get_config()["name"] +except AttributeError : + sgd_repr=sgd + +params={ "h1" : "_".join([str(x) for x in hidden_size]), + "inside_activation" : input_activation, + "out_activation" : output_activation, + "do_dropout": "_".join([str(x) for x in do_do]), + "loss" : loss, + "epochs" : epochs , + "batch_size" : batch, + "patience" : patience, + "sgd" : sgd_repr} +name = "_".join([ str(x) for x in params.values()]) +try: + os.mkdir("{}/SAE_{}".format(in_dir,name)) +except: + pass +db = shelve.open("{}/SAE_{}/ae_model.shelve".format(in_dir,name),writeback=True) +# +json.dump(params, + open("{}/SAE_{}/ae_model.json".format(in_dir,name),"w"), + indent=4) + +keys = ["ASR","TRS"] + +mlp_h = [ 150 , 300 ] +mlp_loss ="categorical_crossentropy" +mlp_dropouts = [0,0,0,0] +mlp_sgd = Adam(0.001) +mlp_epochs = 2000 +mlp_batch_size = 8 + +db["SAE"] = {} + +db["SAEFT"] = {} +for mod in keys : + print "MODE ", mod + res_tuple=train_sae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"], + infer_model["LDA"][mod]["TEST"], + hidden_size,dropouts=do_do, + patience = 
params["patience"],sgd=sgd,input_activation="tanh", + output_activation="tanh",loss=loss,epochs=epochs, + batch_size=batch,verbose=0) + #print len(res), [len(x) for x in res[0]], [ len(x) for x in res[1]] + for name , levels in zip(["SAE","SAEFT"],res_tuple): + print "NAME", name + mlp_res_by_level = [] + for res in levels: + mlp_res_list=[] + for nb,layer in enumerate(res) : + print "layer NB",nb + mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], + layer[1],infer_model["LABEL"][mod]["DEV"], + layer[2],infer_model["LABEL"][mod]["TEST"], + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, + sgd=mlp_sgd,epochs=mlp_epochs,batch_size=mlp_batch_size, + fit_verbose=0)) + mlp_res_by_level.append(mlp_res_list) + db[name][mod]=mlp_res_by_level + +mod = "ASR" +mod2= "TRS" +print "mode SPE " +res_tuple = train_sae(infer_model["LDA"][mod]["TRAIN"], + infer_model["LDA"][mod]["DEV"], + infer_model["LDA"][mod]["TEST"], + hidden_size,dropouts=[0],patience=params["patience"], + sgd=sgd,input_activation=input_activation,output_activation=input_activation, + loss=loss,epochs=epochs,batch_size=batch, + y_train=infer_model["LDA"][mod2]["TRAIN"], + y_dev=infer_model["LDA"][mod2]["DEV"], + y_test=infer_model["LDA"][mod2]["TEST"]) + +for name , levels in zip(["SAE","SAEFT"],res_tuple): + mlp_res_by_level = [] + for res in levels : + mlp_res_list=[] + for layer in res : + mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], + layer[1],infer_model["LABEL"][mod]["DEV"],layer[2], + infer_model["LABEL"][mod]["TEST"], + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, + sgd=mlp_sgd,epochs=mlp_epochs,batch_size=mlp_batch_size, + fit_verbose=0)) + mlp_res_by_level.append(mlp_res_list) + db[name]["SPE"] = mlp_res_by_level + +db.close() diff --git a/LDA/04d-mmf_dsae.py b/LDA/04d-mmf_dsae.py new file mode 100644 index 0000000..f2a6e6f --- /dev/null +++ b/LDA/04d-mmf_dsae.py @@ -0,0 +1,265 @@ + +# coding: utf-8 + +# In[2]: + +# Import +import gensim +from scipy import sparse +import itertools +from sklearn import preprocessing +from keras.models import Sequential +from keras.optimizers import SGD,Adam +from mlp import * +import mlp +import sklearn.metrics +import shelve +import pickle +from utils import * +import sys +import os +import json +# In[4]: + +infer_model=shelve.open("{}".format(sys.argv[2])) +in_dir = sys.argv[1] +#['ASR', 'TRS', 'LABEL'] +# In[6]: + +# AE params +hidden_size=[ 100, 100 ] +input_activation="relu" +output_activation="relu" +loss="mse" +epochs= 1000 +batch_size=1 +patience=20 +do_do=[ 0.25 ] * len(hidden_size) +sgd = Adam(lr=0.00001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) +try : + sgd_repr=sgd.get_config()["name"] +except AttributeError : + sgd_repr=sgd + +# Transforme : +trans_hidden_size=[ 300 , 300 ] +trans_input_activation="relu" +trans_output_activation="relu" +trans_loss="mse" +trans_epochs=1000 +trans_batch_size=8 +trans_patience=20 +trans_do=[ 0.25 ] * len(trans_hidden_size) +trans_sgd = Adam(lr=0.0001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) +try : + trans_sgd_repr=trans_sgd.get_config()["name"] +except AttributeError : + trans_sgd_repr=trans_sgd + + + +ae={ "h1" : "_".join([str(x) for x in hidden_size]), + "inside_activation" : input_activation, + "out_activation" : output_activation, + "do_dropout": "_".join([str(x) for x in do_do]), + "loss" : loss, + "epochs" : epochs , + "batch_size" : batch_size, + "patience" : patience, + 
"sgd" : sgd_repr} +name = "_".join([ str(x) for x in ae.values()]) + +trans={ "h1" : "_".join([str(x) for x in trans_hidden_size]), + "inside_activation" : trans_input_activation, + "out_activation" : trans_output_activation, + "do_dropout": "_".join([str(x) for x in trans_do]), + "loss" : trans_loss, + "epochs" : trans_epochs , + "batch_size" : trans_batch_size, + "patience" : trans_patience, + "sgd" : trans_sgd_repr} + +mlp_h = [ 300 , 300 ] +mlp_loss ="categorical_crossentropy" +mlp_dropouts = [0,0,0,0] +mlp_sgd = Adam(0.0001) +mlp_epochs = 1000 +mlp_batch_size = 8 +mlp_input_activation = "relu" +mlp_output_activation = "softmax" + +try : + mlp_sgd_repr=mlp_sgd.get_config()["name"] +except AttributeError : + mlp_sgd_repr=mlp_sgd + + + +mlp={ "h1" : "_".join([str(x) for x in mlp_h ]), + "inside_activation" : mlp_input_activation, + "out_activation" : mlp_output_activation, + "do_dropout": "_".join([str(x) for x in mlp_dropouts]), + "loss" : mlp_loss, + "epochs" : mlp_epochs , + "batch_size" : mlp_batch_size, + "sgd" : mlp_sgd_repr} + +params = { "ae":ae, "trans":trans, "mlp":mlp} +try: + os.mkdir("{}/DSAE_{}".format(in_dir,name)) +except: + pass +db = shelve.open("{}/DSAE_{}/ae_model.shelve".format(in_dir,name),writeback=True) +# +json.dump(params, + open("{}/DSAE_{}/ae_model.json".format(in_dir,name),"w"), + indent=4) + +keys = ["ASR","TRS"] + + + +db["DSAE"] = {} + +db["DSAEFT"] = {} +mod = "ASR" +res_tuple_ASR = train_ae(infer_model["LDA"][mod]["TRAIN"], + infer_model["LDA"][mod]["DEV"], + infer_model["LDA"][mod]["TEST"], + hidden_size,dropouts=do_do, + patience = patience,sgd=sgd, + input_activation=input_activation, + output_activation=output_activation,loss=loss,epochs=epochs, + batch_size=batch_size,verbose=0,get_weights=True) +mlp_res_list = [] +for layer in res_tuple_ASR[0]: + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], + layer[1],infer_model["LABEL"][mod]["DEV"], + layer[2],infer_model["LABEL"][mod]["TEST"], + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, + sgd=mlp_sgd,epochs=mlp_epochs, + output_activation=mlp_output_activation, + input_activation=mlp_input_activation, + batch_size=mlp_batch_size,fit_verbose=0)) + +db["DSAE"][mod] = mlp_res_list +mod = "TRS" +print hidden_size +res_tuple_TRS = train_ae(infer_model["LDA"][mod]["TRAIN"], + infer_model["LDA"][mod]["DEV"], + infer_model["LDA"][mod]["TEST"], + hidden_size,dropouts=do_do, + sgd=sgd,input_activation=input_activation, + output_activation=output_activation,loss=loss,epochs=epochs, + batch_size=batch_size,patience=patience, + verbose=0,get_weights=True) + +mlp_res_list = [] +for layer in res_tuple_TRS[0]: + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], + layer[1],infer_model["LABEL"][mod]["DEV"], + layer[2],infer_model["LABEL"][mod]["TEST"], + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, + sgd=mlp_sgd,epochs=mlp_epochs, + output_activation=mlp_output_activation, + input_activation=mlp_input_activation, + batch_size=mlp_batch_size,fit_verbose=0)) + +db["DSAE"][mod] = mlp_res_list + + + +transfert = [] + +print " get weight trans" + +for asr_pred, trs_pred in zip(res_tuple_ASR[0], res_tuple_TRS[0]): + print "ASR", [ x.shape for x in asr_pred] + + print "TRS", [ x.shape for x in trs_pred] + print + +for asr_pred, trs_pred in zip(res_tuple_ASR[0], res_tuple_TRS[0]): + print "ASR", [ x.shape for x in asr_pred] + + print "TRS", [ x.shape for x in trs_pred] + transfert.append( train_ae(asr_pred[0], + asr_pred[1], + asr_pred[2], + trans_hidden_size, + dropouts=trans_do, + 
y_train = trs_pred[0], + y_dev=trs_pred[1], + y_test = trs_pred[2], + patience = trans_patience,sgd=trans_sgd, + input_activation=trans_input_activation, + output_activation=trans_output_activation, + loss=trans_loss, + epochs=trans_epochs, + batch_size=trans_batch_size,verbose=0,get_weights=True) ) +mod = "ASR" +mlp_res_bylvl = [] +print " MLP on transfert " +for level, w in transfert : + mlp_res_list = [] + for layer in level : + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], + layer[1],infer_model["LABEL"][mod]["DEV"], + layer[2],infer_model["LABEL"][mod]["TEST"], + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, + sgd=mlp_sgd,epochs=mlp_epochs, + output_activation=mlp_output_activation, + input_activation=mlp_input_activation, + batch_size=mlp_batch_size,fit_verbose=0)) + mlp_res_bylvl.append(mlp_res_list) +db["DSAE"]["transfert"] = mlp_res_bylvl + + +print " FT " +WA = res_tuple_ASR[1] +print "WA", len(WA), [ len(x) for x in WA] +WT = res_tuple_TRS[1] + +print "WT", len(WT), [ len(x) for x in WT] +Wtr = [ x[1] for x in transfert] + +print "Wtr", len(Wtr), [ len(x) for x in Wtr],[ len(x[1]) for x in Wtr] + +ft_res = ft_dsae(infer_model["LDA"]["ASR"]["TRAIN"], + infer_model["LDA"]["ASR"]["DEV"], + infer_model["LDA"]["ASR"]["TEST"], + y_train=infer_model["LDA"]["TRS"]["TRAIN"], + y_dev=infer_model["LDA"]["TRS"]["DEV"], + y_test=infer_model["LDA"]["TRS"]["TEST"], + ae_hidden = hidden_size, + transfer_hidden = trans_hidden_size, + start_weights = WA, + transfer_weights = Wtr, + end_weights = WT, + input_activation = input_activation, + output_activation = output_activation, + ae_dropouts= do_do, + transfer_do = trans_do, + sgd = sgd, + loss = loss , + patience = patience, + batch_size = batch_size, + epochs= epochs) +mlps_by_lvls= [] +for level in ft_res : + mlp_res_list = [] + for layer in level : + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], + layer[1],infer_model["LABEL"][mod]["DEV"], + layer[2],infer_model["LABEL"][mod]["TEST"], + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, + sgd=mlp_sgd,epochs=mlp_epochs, + output_activation=mlp_output_activation, + input_activation=mlp_input_activation, + batch_size=mlp_batch_size,fit_verbose=0)) + mlps_by_lvls.append(mlp_res_list) + + +db["DSAEFT"]["transfert"] = mlps_by_lvls + +db.close() diff --git a/LDA/04e-mm_vae.py b/LDA/04e-mm_vae.py new file mode 100644 index 0000000..4cbb650 --- /dev/null +++ b/LDA/04e-mm_vae.py @@ -0,0 +1,149 @@ + +# coding: utf-8 + +# In[2]: + +# Import +import gensim +from scipy import sparse +import itertools +from sklearn import preprocessing +from keras.models import Sequential +from keras.optimizers import SGD,Adam +from mlp import * +from vae import * +import sklearn.metrics +import shelve +import pickle +from utils import * +import sys +import os +import json +# In[4]: + +infer_model=shelve.open("{}".format(sys.argv[2])) +in_dir = sys.argv[1] +#['ASR', 'TRS', 'LABEL'] +# In[6]: + + +hidden_size= [60] +input_activation="tanh" +output_activation="sigmoid" +epochs=300 +batch=1 +patience=60 +sgd = Adam(lr=0.0001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) +latent_dim = 30 + + + +mlp_h = [ 256 ] +mlp_loss = "categorical_crossentropy" +mlp_dropouts = [] +mlp_sgd = Adam(lr=0.001) +mlp_epochs = 1000 +mlp_batch_size = 16 +mlp_output_activation="softmax" + +try : + sgd_repr=sgd.get_config()["name"] +except AttributeError : + sgd_repr=sgd + +try : + mlp_sgd_repr=mlp_sgd.get_config()["name"] +except AttributeError : 
+ mlp_sgd_repr=mlp_sgd + + +params={ "h1" : "_".join([ str(x) for x in hidden_size ]), + "inside_activation" : input_activation, + "output_activation" : output_activation, + "epochs" : epochs , + "batch_size" : batch, + "patience" : patience, + "sgd" : sgd_repr, + "mlp_h ": "_".join([str(x) for x in mlp_h]), + "mlp_loss ": mlp_loss, + "mlp_dropouts ": "_".join([str(x) for x in mlp_dropouts]), + "mlp_sgd ": mlp_sgd_repr, + "mlp_epochs ": mlp_epochs, + "mlp_batch_size ": mlp_batch_size, + "mlp_output" : mlp_output_activation + } +name = "_".join([ str(x) for x in params.values()]) +try: + os.mkdir("{}/VAE_{}".format(in_dir,name)) +except: + pass +db = shelve.open("{}/VAE_{}/ae_model.shelve".format(in_dir,name),writeback=True) +db["params"] = params +db["LABEL"]=infer_model["LABEL"] +# +json.dump(params, + open("{}/VAE_{}/ae_model.json".format(in_dir,name),"w"), + indent=4) + +keys = ["ASR","TRS"] + +db["VAE"] = {} +db["LDA"] = {} +for mod in keys : + print mod + db["LDA"][mod] = train_mlp(infer_model["LDA"][mod]["TRAIN"],infer_model["LABEL"][mod]["TRAIN"], + infer_model["LDA"][mod]["DEV"],infer_model["LABEL"][mod]["DEV"], + infer_model["LDA"][mod]["TEST"],infer_model["LABEL"][mod]["TEST"], + mlp_h ,sgd=mlp_sgd, + epochs=mlp_epochs, + batch_size=mlp_batch_size, + input_activation=input_activation, + output_activation=mlp_output_activation, + dropouts=mlp_dropouts, + fit_verbose=0) + + res=train_vae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"],infer_model["LDA"][mod]["TEST"], + hidden_size=hidden_size[0], + latent_dim=latent_dim,sgd=sgd, + input_activation=input_activation,output_activation=output_activation, + nb_epochs=epochs,batch_size=batch) + mlp_res_list=[] + for layer in res : + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], + layer[1],infer_model["LABEL"][mod]["DEV"], + layer[2],infer_model["LABEL"][mod]["TEST"], + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs, + output_activation=mlp_output_activation, + input_activation=input_activation, + batch_size=mlp_batch_size,fit_verbose=0)) + db["VAE"][mod]=mlp_res_list + +mod = "ASR" +mod2= "TRS" +mlp_res_list=[] + +res = train_vae(infer_model["LDA"][mod]["TRAIN"], + infer_model["LDA"][mod]["DEV"], + infer_model["LDA"][mod]["TEST"], + hidden_size=hidden_size[0], + sgd=sgd,input_activation=input_activation,output_activation=output_activation, + latent_dim=latent_dim, + nb_epochs=epochs, + batch_size=batch, + y_train=infer_model["LDA"][mod2]["TRAIN"], + y_dev=infer_model["LDA"][mod2]["DEV"], + y_test=infer_model["LDA"][mod2]["TEST"]) + +for layer in res : + mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], + layer[1],infer_model["LABEL"][mod]["DEV"], + layer[2],infer_model["LABEL"][mod]["TEST"], + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs, + output_activation=mlp_output_activation, + input_activation=input_activation, + batch_size=mlp_batch_size,fit_verbose=0)) + +db["VAE"]["SPE"] = mlp_res_list + +db.sync() +db.close() diff --git a/LDA/05-mmf_getscore.py b/LDA/05-mmf_getscore.py new file mode 100644 index 0000000..e4ce576 --- /dev/null +++ b/LDA/05-mmf_getscore.py @@ -0,0 +1,73 @@ +import numpy as np +import shelve +import sys +import glob +from collections import defaultdict +from tinydb import TinyDB, Query +from mako.template import Template +import time + +def get_best(x): + argbest=np.argmax(x[1]) + maxdev=x[1][argbest] + maxtrain=np.max(x[0]) + maxtest=np.max(x[2]) + besttest=x[2][argbest] + return ( 
maxtrain,maxdev,maxtest,besttest) +depth = lambda L: isinstance(L, list) and max(map(depth, L))+1 + + +template_name = ''' +${name} +======================== + +MLP scores : +------------------- +''' +template_value='''\n\n +| ${model} ${ttype} | train | dev |max test| best test| +| -------------------:|:--------:|:---------:|:------:|:--------:| +% for cpt,line in enumerate(models[model][ttype]): +| ${cpt} | ${line[0]} | ${line[1]} |${line[2]} | ${line[3]} | +% endfor +\n +''' + +# ae_model.shelve +def get_folder_file(x): + folder=x.split("/")[1] + shelve_file = ".".join(x.split(".")[:-1]) + return(folder,shelve_file) + +in_folder = sys.argv[1] + + +models = defaultdict(dict) + +ae_model_list = glob.glob("{}/*/ae_model.shelve.dir".format(in_folder)) +ae_model_list = sorted(ae_model_list) +ae_model_list= map(get_folder_file,ae_model_list) +for name , shelve_file in ae_model_list : + print Template(template_name).render(name=name) + opened_shelve = shelve.open(shelve_file) + keys = opened_shelve.keys() + if "LABEL" in keys : + keys.remove("LABEL") + if "params" in keys: + keys.remove("params") + to_print = [] + for working_key in keys: + for key in opened_shelve[working_key].keys(): + table_depth = depth(opened_shelve[working_key][key]) + if table_depth == 3 : + models[working_key][key] = [ get_best(x) for x in opened_shelve[working_key][key] ] + to_print.append(Template(template_value).render(model=working_key,ttype=key,models=models).strip()) + elif table_depth == 2 : + models[working_key][key] = [ get_best(opened_shelve[working_key][key]) ] + to_print.append(Template(template_value).render(model=working_key,ttype=key,models=models).strip()) + elif table_depth == 4 : + for layer in opened_shelve[working_key][key] : + models[working_key][key] = [ get_best(x) for x in layer ] + to_print.append(Template(template_value).render(model=working_key,ttype=key,models=models).strip()) + print "\n".join(to_print) + diff --git a/LDA/run2.sh b/LDA/run2.sh index c217da8..dfd6e0b 100644 --- a/LDA/run2.sh +++ b/LDA/run2.sh @@ -1,5 +1,5 @@ #python 00-prepross.py -python 02-lda-order.py DECODA_list_wid.shelve output_v5/perplex.db 50 10 output_v5 50_500 0 1_0.1 1_0.1 500_1000 100_2000 +python 02-lda.py DECODA_list_wid.shelve output_v5/perplex.db 50 10 output_v5 50_500 0 1_0.1 1_0.1 500_1000 100_2000 #python 03-perplex.py DECODA_list_wid.shelve output_v5 output_v5/perplex.db python 03-order_by_perp.py output_v5/perplex.db output_v5 bash 04-run_mlp_ae.sh output_v5 DECODA_list_wid.shelve diff --git a/LDA/utils.py b/LDA/utils.py index 0541953..c901e37 100644 --- a/LDA/utils.py +++ b/LDA/utils.py @@ -39,3 +39,7 @@ def yield_corpus(df_list): raise def select(elm): return int(elm.split("_")[-1]) + + +def select_mmf(elm): + return int(elm.split("_")[0]) diff --git a/LDA/vae.py b/LDA/vae.py new file mode 100644 index 0000000..b846e53 --- /dev/null +++ b/LDA/vae.py @@ -0,0 +1,102 @@ +'''This script demonstrates how to build a variational autoencoder with Keras. 
+Reference: "Auto-Encoding Variational Bayes" https://arxiv.org/abs/1312.6114 +''' + +import itertools +import sys +import json + +import numpy as np +import matplotlib.pyplot as plt +from scipy import sparse +import scipy.io + +from keras.layers import Input, Dense, Lambda +from keras.models import Model +from keras import backend as K +from keras import objectives +from keras.datasets import mnist + +import pandas +import shelve +import pickle + + + + + +#batch_size = 16 +#original_dim = 784 +#latent_dim = 2 +#intermediate_dim = 128 +#epsilon_std = 0.01 +#nb_epoch = 40 + + + + +def train_vae(x_train,x_dev,x_test,y_train=None,y_dev=None,y_test=None,hidden_size=80,latent_dim=12,batch_size=8,nb_epochs=10,sgd="rmsprop",input_activation = "relu",output_activation = "sigmoid",epsilon_std=0.01): + + + + def sampling(args): + z_mean, z_log_std = args + epsilon = K.random_normal(shape=(batch_size, latent_dim), + mean=0., std=epsilon_std) + return z_mean + K.exp(z_log_std) * epsilon + + def vae_loss(x, x_decoded_mean): + xent_loss = objectives.binary_crossentropy(x, x_decoded_mean) + kl_loss = - 0.5 * K.mean(1 + z_log_std - K.square(z_mean) - K.exp(z_log_std), axis=-1) + return xent_loss + kl_loss + + original_dim = x_train.shape[1] + + + x = Input(batch_shape=(batch_size, original_dim)) + h = Dense(hidden_size, activation=input_activation)(x) + z_mean = Dense(latent_dim)(h) + z_log_std = Dense(latent_dim)(h) + + + # note that "output_shape" isn't necessary with the TensorFlow backend + # so you could write `Lambda(sampling)([z_mean, z_log_std])` + z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_std]) + + # we instantiate these layers separately so as to reuse them later + decoder_h = Dense(hidden_size, activation=input_activation) + decoder_mean = Dense(original_dim, activation=output_activation) + h_decoded = decoder_h(z) + x_decoded_mean = decoder_mean(h_decoded) + + + vae = Model(x, x_decoded_mean) + vae.compile(optimizer=sgd, loss=vae_loss) + + # train the VAE on MNIST digits + if y_train is None or y_dev is None or y_test is None : + y_train = x_train + y_dev = x_dev + y_test = x_test + + vae.fit(x_train, y_train, + shuffle=True, + nb_epoch=nb_epochs, + batch_size=batch_size, + validation_data=(x_dev, y_dev)) + + # build a model to project inputs on the latent space + encoder = Model(x, z_mean) + pred_train = encoder.predict(x_train, batch_size=batch_size) + pred_dev = encoder.predict(x_dev, batch_size=batch_size) + pred_test = encoder.predict(x_test,batch_size=batch_size) + return [ [ pred_train, pred_dev, pred_test ] ] +# display a 2D plot of the digit classes in the latent space + #x_test_encoded = encoder.predict(x_test, batch_size=batch_size) + # build a digit generator that can sample from the learned distribution + #decoder_input = Input(shape=(latent_dim,)) + #_h_decoded = decoder_h(decoder_input) + #_x_decoded_mean = decoder_mean(_h_decoded) + #generator = Model(decoder_input, _x_decoded_mean) + #x_decoded = generator.predict(z_sample) +
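train_vae above returns a single-element list, [[pred_train, pred_dev, pred_test]], holding the latent-mean encoding of each split; 04e-mm_vae.py feeds those encodings to train_mlp. A minimal sketch of calling it directly on random data is given below; the shapes, sample counts and hyper-parameter values are assumptions for illustration only, and it presumes the same Keras 1.x API that vae.py itself imports.

    # usage sketch (illustrative only); sample counts are kept divisible by
    # batch_size because the model is built with a fixed batch_shape
    import numpy as np
    from vae import train_vae

    x_train = np.random.rand(32, 50)   # e.g. 32 documents x 50 LDA dimensions
    x_dev = np.random.rand(16, 50)
    x_test = np.random.rand(16, 50)

    # unsupervised case: y_train/y_dev/y_test default to the inputs themselves
    layers = train_vae(x_train, x_dev, x_test,
                       hidden_size=60, latent_dim=30,
                       sgd="rmsprop",
                       input_activation="tanh", output_activation="sigmoid",
                       nb_epochs=5, batch_size=8)
    pred_train, pred_dev, pred_test = layers[0]   # latent-mean encodings per split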