From 2af8e57f4e1ebcfdd5ba9d3e8963c4853e472982 Mon Sep 17 00:00:00 2001
From: Killian
Date: Fri, 22 Jul 2016 11:10:31 +0200
Subject: [PATCH] change all

---
 LDA/04a-mmdf.py        |  15 ++++---
 LDA/04b-mmf_mini_ae.py | 111 ++++++++++++++++++++++++++++++++++---------------
 LDA/04c-mmf_sae.py     | 101 +++++++++++++++++++++++++++++---------------
 LDA/04d-mmf_dsae.py    |  28 +++++++------
 LDA/04e-mm_vae.py      | 104 +++++++++++++++++++++++++++++----------------
 LDA/05-lts_scoring.py  |  80 +++++++++++++++++++++++++++++++++++
 LDA/mlp.py             |   1 -
 LDA/utils.py           |  59 ++++++++++++++++++++++++++
 LDA/vae.py             |  55 +++++++++++++++++++++--
 9 files changed, 430 insertions(+), 124 deletions(-)
 create mode 100644 LDA/05-lts_scoring.py

diff --git a/LDA/04a-mmdf.py b/LDA/04a-mmdf.py
index 8c49391..a891987 100644
--- a/LDA/04a-mmdf.py
+++ b/LDA/04a-mmdf.py
@@ -31,7 +31,10 @@ from gensim.models import LdaModel
 #db=shelve.open("SPELIKE_MLP_DB.shelve",writeback=True)
 origin_corps=shelve.open("{}".format(sys.argv[2]))
 in_dir = sys.argv[1]
-
+if len(sys.argv) > 3 :
+    features_key = sys.argv[3]
+else :
+    features_key = "LDA"
 
 out_db=shelve.open("{}/mlp_scores.shelve".format(in_dir),writeback=True)
 
@@ -40,16 +43,16 @@ mlp_loss = "categorical_crossentropy"
 mlp_dropouts = [0.25]* len(mlp_h)
 mlp_sgd = Adam(lr=0.0001)
 mlp_epochs = 3000
-mlp_batch_size = 1
+mlp_batch_size = 5
 mlp_input_activation = "relu"
 mlp_output_activation="softmax"
 
 ress = []
-for key in ["TRS", "ASR"] :
+for key in origin_corps[features_key].keys() :
 
-    res=mlp.train_mlp(origin_corps["LDA"][key]["TRAIN"],origin_corps["LABEL"][key]["TRAIN"],
-                      origin_corps["LDA"][key]["DEV"],origin_corps["LABEL"][key]["DEV"],
-                      origin_corps["LDA"][key]["TEST"],origin_corps["LABEL"][key]["TEST"],
+    res=mlp.train_mlp(origin_corps[features_key][key]["TRAIN"],origin_corps["LABEL"][key]["TRAIN"],
+                      origin_corps[features_key][key]["DEV"],origin_corps["LABEL"][key]["DEV"],
+                      origin_corps[features_key][key]["TEST"],origin_corps["LABEL"][key]["TEST"],
                       mlp_h,dropouts=mlp_dropouts,sgd=mlp_sgd,
                       epochs=mlp_epochs,
                       batch_size=mlp_batch_size,
diff --git a/LDA/04b-mmf_mini_ae.py b/LDA/04b-mmf_mini_ae.py
index dc52788..b500b0c 100644
--- a/LDA/04b-mmf_mini_ae.py
+++ b/LDA/04b-mmf_mini_ae.py
@@ -10,6 +10,7 @@ import itertools
 from sklearn import preprocessing
 from keras.models import Sequential
 from keras.optimizers import SGD,Adam
+from keras.layers.advanced_activations import ELU,PReLU
 from mlp import *
 import sklearn.metrics
 import shelve
@@ -24,12 +25,24 @@ infer_model=shelve.open("{}".format(sys.argv[2]))
 in_dir = sys.argv[1]
 #['ASR', 'TRS', 'LABEL']
 # In[6]:
-
+if len(sys.argv) > 4 :
+    features_key = sys.argv[4]
+else :
+    features_key = "LDA"
+save_projection = True
 json_conf =json.load(open(sys.argv[3]))
 ae_conf = json_conf["ae"]
 
 hidden_size= ae_conf["hidden_size"]
-input_activation=ae_conf["input_activation"]
+input_activation = None
+print ae_conf["input_activation"]
+if ae_conf["input_activation"] == "elu":
+    print " ELU"
+    input_activation = PReLU()
+else:
+    print " ELSE"
+    input_activation = ae_conf["input_activation"]
+#input_activation=ae_conf["input_activation"]
 output_activation=ae_conf["output_activation"]
 loss=ae_conf["loss"]
 epochs=ae_conf["epochs"]
@@ -72,14 +85,18 @@ except:
 db = shelve.open("{}/{}/ae_model.shelve".format(in_dir,name),writeback=True)
 db["LABEL"]=infer_model["LABEL"]
 #
-keys = ["ASR","TRS"]
+keys = infer_model[features_key].keys()
 
 db["AE"] = {}
-db["LDA"] = {}
+db[features_key] = {}
 for mod in keys :
-    db["LDA"][mod] = train_mlp(infer_model["LDA"][mod]["TRAIN"],infer_model["LABEL"][mod]["TRAIN"],
-                               infer_model["LDA"][mod]["DEV"],infer_model["LABEL"][mod]["DEV"],
-                               infer_model["LDA"][mod]["TEST"],infer_model["LABEL"][mod]["TEST"],
+    print infer_model[features_key][mod]["TRAIN"].shape
+    print infer_model[features_key][mod]["DEV"].shape
+    print infer_model[features_key][mod]["TEST"].shape
+
+    db[features_key][mod] = train_mlp(infer_model[features_key][mod]["TRAIN"],infer_model["LABEL"][mod]["TRAIN"],
+                                      infer_model[features_key][mod]["DEV"],infer_model["LABEL"][mod]["DEV"],
+                                      infer_model[features_key][mod]["TEST"],infer_model["LABEL"][mod]["TEST"],
                                mlp_h ,sgd=mlp_sgd,
                                epochs=mlp_epochs,
                                batch_size=mlp_batch_size,
@@ -87,13 +104,25 @@ for mod in keys :
                                output_activation=mlp_output_activation,
                                dropouts=mlp_dropouts,
                                fit_verbose=0)
-
-    res=train_ae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"],infer_model["LDA"][mod]["TEST"],
+    print input_activation
+    res=train_ae(infer_model[features_key][mod]["TRAIN"],infer_model[features_key][mod]["DEV"],infer_model[features_key][mod]["TEST"],
                  hidden_size,patience = patience,sgd=sgd,
                  dropouts=do_do,input_activation=input_activation,output_activation=output_activation,
                  loss=loss,epochs=epochs,batch_size=batch,verbose=0)
     mlp_res_list=[]
-    for layer in res :
+    for nb,layer in enumerate(res) :
+        if save_projection:
+            pd = pandas.DataFrame(layer[0])
+            col_count = (pd.sum(axis=0) != 0)
+            pd = pd.loc[:,col_count]
+            pd.to_hdf("{}/{}/AE_{}_{}_df.hdf".format(in_dir,name,nb,mod),"TRAIN")
+            pd = pandas.DataFrame(layer[1])
+            pd = pd.loc[:,col_count]
+            pd.to_hdf("{}/{}/AE_{}_{}_df.hdf".format(in_dir,name,nb,mod),"DEV")
+            pd = pandas.DataFrame(layer[2])
+            pd = pd.loc[:,col_count]
+            pd.to_hdf("{}/{}/AE_{}_{}_df.hdf".format(in_dir,name,nb,mod),"TEST")
+            del pd
         mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"],
                                       layer[1],infer_model["LABEL"][mod]["DEV"],
                                       layer[2],infer_model["LABEL"][mod]["TEST"],
@@ -103,30 +132,44 @@ for mod in keys :
                                       batch_size=mlp_batch_size,fit_verbose=0))
     db["AE"][mod]=mlp_res_list
 
-mod = "ASR"
-mod2= "TRS"
-mlp_res_list=[]
-
-res = train_ae(infer_model["LDA"][mod]["TRAIN"],
-               infer_model["LDA"][mod]["DEV"],
-               infer_model["LDA"][mod]["TEST"],
-               hidden_size,dropouts=do_do,patience = patience,
-               sgd=sgd,input_activation=input_activation,output_activation=output_activation,loss=loss,epochs=epochs,
-               batch_size=batch,
-               y_train=infer_model["LDA"][mod]["TRAIN"],
-               y_dev=infer_model["LDA"][mod2]["DEV"],
-               y_test=infer_model["LDA"][mod2]["TEST"])
-
-for layer in res :
-    mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"],
-                                  layer[1],infer_model["LABEL"][mod]["DEV"],
-                                  layer[2],infer_model["LABEL"][mod]["TEST"],
-                                  mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs,
-                                  output_activation=mlp_output_activation,
-                                  input_activation=input_activation,
-                                  batch_size=mlp_batch_size,fit_verbose=0))
-
-db["AE"]["SPE"] = mlp_res_list
+if "ASR" in keys and "TRS" in keys:
+    mod = "ASR"
+    mod2= "TRS"
+    mlp_res_list=[]
+
+    res = train_ae(infer_model[features_key][mod]["TRAIN"],
+                   infer_model[features_key][mod]["DEV"],
+                   infer_model[features_key][mod]["TEST"],
+                   hidden_size,dropouts=do_do,patience = patience,
+                   sgd=sgd,input_activation=input_activation,output_activation=output_activation,loss=loss,epochs=epochs,
+                   batch_size=batch,
+                   y_train=infer_model[features_key][mod]["TRAIN"],
+                   y_dev=infer_model[features_key][mod2]["DEV"],
+                   y_test=infer_model[features_key][mod2]["TEST"])
+
+    for nb,layer in enumerate(res) :
+        if save_projection:
+            pd = pandas.DataFrame(layer[0])
+            col_count= (pd.sum(axis=0) != 0)
+            pd = pd.loc[:,col_count]
+            pd.to_hdf("{}/{}/AE_{}_{}_df.hdf".format(in_dir,name,nb,"SPE"),"TRAIN")
+            pd = pandas.DataFrame(layer[1])
+            pd = pd.loc[:,col_count]
+            pd.to_hdf("{}/{}/AE_{}_{}_df.hdf".format(in_dir,name,nb,"SPE"),"DEV")
+            pd = pandas.DataFrame(layer[2])
+            pd = pd.loc[:,col_count]
+            pd.to_hdf("{}/{}/AE_{}_{}_df.hdf".format(in_dir,name,nb,"SPE"),"TEST")
+            del pd
+
+        mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"],
+                                      layer[1],infer_model["LABEL"][mod]["DEV"],
+                                      layer[2],infer_model["LABEL"][mod]["TEST"],
+                                      mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs,
+                                      output_activation=mlp_output_activation,
+                                      input_activation=input_activation,
+                                      batch_size=mlp_batch_size,fit_verbose=0))
+
+    db["AE"]["SPE"] = mlp_res_list
 
 db.sync()
 db.close()
diff --git a/LDA/04c-mmf_sae.py b/LDA/04c-mmf_sae.py
index 1130cac..fc51a57 100644
--- a/LDA/04c-mmf_sae.py
+++ b/LDA/04c-mmf_sae.py
@@ -23,6 +23,11 @@ import json
 infer_model=shelve.open("{}".format(sys.argv[2]))
 in_dir = sys.argv[1]
 
+if len(sys.argv) > 4 :
+    features_key = sys.argv[4]
+else :
+    features_key = "LDA"
+save_projection = True
 #['ASR', 'TRS', 'LABEL']
 # In[6]:
 json_conf =json.load(open(sys.argv[3]))
@@ -47,13 +52,13 @@ except :
     sgd = sae_conf["sgd"]
 
 name = json_conf["name"]
+print name
 try:
     os.mkdir("{}/{}".format(in_dir,name))
 except:
     pass
 db = shelve.open("{}/{}/ae_model.shelve".format(in_dir,name),writeback=True)
 #
-keys = ["ASR","TRS"]
 
 mlp_conf = json_conf["mlp"]
 mlp_h = mlp_conf["hidden_size"]
 mlp_loss = mlp_conf["loss"]
@@ -72,23 +77,38 @@ try:
 except :
     mlp_sgd = mlp_conf["sgd"]
 
-
+keys = infer_model[features_key].keys()
 db["SAE"] = {}
 
 db["SAEFT"] = {}
 for mod in keys :
-    res_tuple=train_sae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"],
-                        infer_model["LDA"][mod]["TEST"],
+    res_tuple=train_sae(infer_model[features_key][mod]["TRAIN"],infer_model[features_key][mod]["DEV"],
+                        infer_model[features_key][mod]["TEST"],
                         hidden_size,dropouts=do_do,
                         patience = "patience",sgd=sgd,input_activation="tanh",
                         output_activation="tanh",loss=loss,epochs=epochs,
                         batch_size=batch,verbose=0)
     #print len(res), [len(x) for x in res[0]], [ len(x) for x in res[1]]
-    for name , levels in zip(["SAE","SAEFT"],res_tuple):
+    for i, levels in zip(["SAE","SAEFT"],res_tuple):
         mlp_res_by_level = []
-        for res in levels:
+        for lvl,res in enumerate(levels):
             mlp_res_list=[]
             for nb,layer in enumerate(res) :
+                if save_projection:
+                    pd = pandas.DataFrame(layer[0])
+                    col_count= (pd.sum(axis=0) != 0)
+                    pd = pd.loc[:,col_count]
+                    hdffile = "{}/{}/{}_{}_{}_{}_df.hdf".format(in_dir,name,i,lvl,nb,mod)
+                    print hdffile
+                    pd.to_hdf(hdffile,"TRAIN")
+                    pd = pandas.DataFrame(layer[1])
+                    pd = pd.loc[:,col_count]
+                    pd.to_hdf(hdffile,"DEV")
+                    pd = pandas.DataFrame(layer[2])
+                    pd = pd.loc[:,col_count]
+                    pd.to_hdf(hdffile,"TEST")
+                    del pd
+
                 mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"],
                                               layer[1],infer_model["LABEL"][mod]["DEV"],
                                               layer[2],infer_model["LABEL"][mod]["TEST"],
@@ -96,33 +116,48 @@ for mod in keys :
                                               sgd=mlp_sgd,epochs=mlp_epochs,batch_size=mlp_batch_size,
                                               fit_verbose=0))
             mlp_res_by_level.append(mlp_res_list)
-        db[name][mod]=mlp_res_by_level
-
-mod = "ASR"
-mod2= "TRS"
-res_tuple = train_sae(infer_model["LDA"][mod]["TRAIN"],
-                      infer_model["LDA"][mod]["DEV"],
-                      infer_model["LDA"][mod]["TEST"],
-                      hidden_size,dropouts=[0],patience="patience",
-                      sgd=sgd,input_activation=input_activation,output_activation=input_activation,
-                      loss=loss,epochs=epochs,batch_size=batch,
-                      y_train=infer_model["LDA"][mod2]["TRAIN"],
-                      y_dev=infer_model["LDA"][mod2]["DEV"],
-                      y_test=infer_model["LDA"][mod2]["TEST"])
-
-for name , levels in zip(["SAE","SAEFT"],res_tuple):
-    mlp_res_by_level = []
-    for res in levels :
-        mlp_res_list=[]
-        for layer in res :
-            mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"],
-                                          layer[1],infer_model["LABEL"][mod]["DEV"],layer[2],
-                                          infer_model["LABEL"][mod]["TEST"],
-                                          mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,
-                                          sgd=mlp_sgd,epochs=mlp_epochs,batch_size=mlp_batch_size,
-                                          fit_verbose=0))
-        mlp_res_by_level.append(mlp_res_list)
-    db[name]["SPE"] = mlp_res_by_level
+        db[i][mod]=mlp_res_by_level
+
+
+if "ASR" in keys and "TRS" in keys :
+    mod = "ASR"
+    mod2= "TRS"
+    res_tuple = train_sae(infer_model[features_key][mod]["TRAIN"],
+                          infer_model[features_key][mod]["DEV"],
+                          infer_model[features_key][mod]["TEST"],
+                          hidden_size,dropouts=[0],patience="patience",
+                          sgd=sgd,input_activation=input_activation,output_activation=input_activation,
+                          loss=loss,epochs=epochs,batch_size=batch,
+                          y_train=infer_model[features_key][mod2]["TRAIN"],
+                          y_dev=infer_model[features_key][mod2]["DEV"],
+                          y_test=infer_model[features_key][mod2]["TEST"])
+
+    for i , levels in zip(["SAE","SAEFT"],res_tuple):
+        mlp_res_by_level = []
+        for lvl,res in enumerate(levels) :
+            mlp_res_list=[]
+            for nb,layer in enumerate(res) :
+                if save_projection:
+                    pd = pandas.DataFrame(layer[0])
+                    col_count= (pd.sum(axis=0) != 0)
+                    pd = pd.loc[:,col_count]
+                    pd.to_hdf("{}/{}/{}_{}_{}_{}_df.hdf".format(in_dir,name,i,lvl,nb,"SPE"),"TRAIN")
+                    pd = pandas.DataFrame(layer[1])
+                    pd = pd.loc[:,col_count]
+                    pd.to_hdf("{}/{}/{}_{}_{}_{}_df.hdf".format(in_dir,name,i,lvl,nb,"SPE"),"DEV")
+                    pd = pandas.DataFrame(layer[2])
+                    pd = pd.loc[:,col_count]
+                    pd.to_hdf("{}/{}/{}_{}_{}_{}_df.hdf".format(in_dir,name,i,lvl,nb,"SPE"),"TEST")
+                    del pd
+
+                mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"],
+                                              layer[1],infer_model["LABEL"][mod]["DEV"],layer[2],
+                                              infer_model["LABEL"][mod]["TEST"],
+                                              mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,
+                                              sgd=mlp_sgd,epochs=mlp_epochs,batch_size=mlp_batch_size,
+                                              fit_verbose=0))
+            mlp_res_by_level.append(mlp_res_list)
+        db[i]["SPE"] = mlp_res_by_level
 
 db.sync()
 db.close()
diff --git a/LDA/04d-mmf_dsae.py b/LDA/04d-mmf_dsae.py
index d768f9b..401aa2d 100644
--- a/LDA/04d-mmf_dsae.py
+++ b/LDA/04d-mmf_dsae.py
@@ -26,6 +26,10 @@ infer_model=shelve.open("{}".format(sys.argv[2]))
 in_dir = sys.argv[1]
 #['ASR', 'TRS', 'LABEL']
 # In[6]:
+if len(sys.argv) > 4 :
+    features_key = sys.argv[4]
+else :
+    features_key = "LDA"
 
 json_conf =json.load(open(sys.argv[3]))
 
@@ -101,9 +105,9 @@ db["DSAE"] = {}
 
 db["DSAEFT"] = {}
 mod = "ASR"
-res_tuple_ASR = train_ae(infer_model["LDA"][mod]["TRAIN"],
-                         infer_model["LDA"][mod]["DEV"],
-                         infer_model["LDA"][mod]["TEST"],
+res_tuple_ASR = train_ae(infer_model[features_key][mod]["TRAIN"],
+                         infer_model[features_key][mod]["DEV"],
+                         infer_model[features_key][mod]["TEST"],
                          hidden_size,dropouts=do_do,
                          patience = patience,sgd=sgd,
                          input_activation=input_activation,
@@ -122,9 +126,9 @@ for layer in res_tuple_ASR[0]:
 db["DSAE"][mod] = mlp_res_list
 
 mod = "TRS"
-res_tuple_TRS = train_ae(infer_model["LDA"][mod]["TRAIN"],
-                         infer_model["LDA"][mod]["DEV"],
-                         infer_model["LDA"][mod]["TEST"],
+res_tuple_TRS = train_ae(infer_model[features_key][mod]["TRAIN"],
+                         infer_model[features_key][mod]["DEV"],
+                         infer_model[features_key][mod]["TEST"],
                          hidden_size,dropouts=do_do,
                          sgd=sgd,input_activation=input_activation,
                          output_activation=output_activation,loss=loss,epochs=epochs,
@@ -202,12 +206,12 @@ Wtr = [ x[1] for x in transfert]
 
 #print "Wtr", len(Wtr), [ len(x) for x in Wtr],[ len(x[1]) for x in Wtr]
 
-ft_res = ft_dsae(infer_model["LDA"]["ASR"]["TRAIN"],
-                 infer_model["LDA"]["ASR"]["DEV"],
-                 infer_model["LDA"]["ASR"]["TEST"],
-                 y_train=infer_model["LDA"]["TRS"]["TRAIN"],
-                 y_dev=infer_model["LDA"]["TRS"]["DEV"],
-                 y_test=infer_model["LDA"]["TRS"]["TEST"],
+ft_res = ft_dsae(infer_model[features_key]["ASR"]["TRAIN"],
+                 infer_model[features_key]["ASR"]["DEV"],
+                 infer_model[features_key]["ASR"]["TEST"],
+                 y_train=infer_model[features_key]["TRS"]["TRAIN"],
+                 y_dev=infer_model[features_key]["TRS"]["DEV"],
+                 y_test=infer_model[features_key]["TRS"]["TEST"],
                  ae_hidden = hidden_size,
                  transfer_hidden = trans_hidden_size,
                  start_weights = WA,
diff --git a/LDA/04e-mm_vae.py b/LDA/04e-mm_vae.py
index 7818868..60ec159 100644
--- a/LDA/04e-mm_vae.py
+++ b/LDA/04e-mm_vae.py
@@ -21,7 +21,12 @@ infer_model=shelve.open("{}".format(sys.argv[2]))
 in_dir = sys.argv[1]
 #['ASR', 'TRS', 'LABEL']
 # In[6]:
+if len(sys.argv) > 4 :
+    features_key = sys.argv[4]
+else :
+    features_key = "LDA"
+save_projection = True
 
 json_conf =json.load(open(sys.argv[3]))
 
 vae_conf = json_conf["vae"]
@@ -63,10 +68,11 @@ except:
 
 name = json_conf["name"]
 
-
-try:
+try :
+    print "make folder "
     os.mkdir("{}/{}".format(in_dir,name))
 except:
+    print "folder not created"
     pass
 
@@ -74,15 +80,16 @@ db = shelve.open("{}/{}/ae_model.shelve".format(in_dir,name),writeback=True)
 db["LABEL"]=infer_model["LABEL"]
 #
-keys = ["ASR","TRS"]
+
+keys = infer_model[features_key].keys()
 
 db["VAE"] = {}
-db["LDA"] = {}
+db[features_key] = {}
 for mod in keys :
     #print mod
-    db["LDA"][mod] = train_mlp(infer_model["LDA"][mod]["TRAIN"],infer_model["LABEL"][mod]["TRAIN"],
-                               infer_model["LDA"][mod]["DEV"],infer_model["LABEL"][mod]["DEV"],
-                               infer_model["LDA"][mod]["TEST"],infer_model["LABEL"][mod]["TEST"],
+    db[features_key][mod] = train_mlp(infer_model[features_key][mod]["TRAIN"],infer_model["LABEL"][mod]["TRAIN"],
+                                      infer_model[features_key][mod]["DEV"],infer_model["LABEL"][mod]["DEV"],
+                                      infer_model[features_key][mod]["TEST"],infer_model["LABEL"][mod]["TEST"],
                                mlp_h ,sgd=mlp_sgd,
                                epochs=mlp_epochs,
                                batch_size=mlp_batch_size,
@@ -91,13 +98,26 @@ for mod in keys :
                                dropouts=mlp_dropouts,
                                fit_verbose=0)
 
-    res=train_vae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"],infer_model["LDA"][mod]["TEST"],
+    res=train_vae(infer_model[features_key][mod]["TRAIN"],infer_model[features_key][mod]["DEV"],infer_model[features_key][mod]["TEST"],
                   hidden_size=hidden_size[0],
                   latent_dim=latent_dim,sgd=sgd,
                   input_activation=input_activation,output_activation=output_activation,
                   nb_epochs=epochs,batch_size=batch)
     mlp_res_list=[]
-    for layer in res :
+    for nb,layer in enumerate(res) :
+        if save_projection:
+            pd = pandas.DataFrame(layer[0])
+            col_count = (pd.sum(axis=0) != 0)
+            pd = pd.loc[:,col_count]
+            pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,mod),"TRAIN")
+            pd = pandas.DataFrame(layer[1])
+            pd = pd.loc[:,col_count]
+            pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,mod),"DEV")
+            pd = pandas.DataFrame(layer[2])
+            pd = pd.loc[:,col_count]
+            pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,mod),"TEST")
+            del pd
+
         mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"],
                                       layer[1],infer_model["LABEL"][mod]["DEV"],
                                       layer[2],infer_model["LABEL"][mod]["TEST"],
@@ -107,32 +127,46 @@ for mod in keys :
                                       batch_size=mlp_batch_size,fit_verbose=0))
     db["VAE"][mod]=mlp_res_list
 
-mod = "ASR"
-mod2= "TRS"
-mlp_res_list=[]
-
-res = train_vae(infer_model["LDA"][mod]["TRAIN"],
-                infer_model["LDA"][mod]["DEV"],
-                infer_model["LDA"][mod]["TEST"],
-                hidden_size=hidden_size[0],
-                sgd=sgd,input_activation=input_activation,output_activation=output_activation,
-                latent_dim=latent_dim,
-                nb_epochs=epochs,
-                batch_size=batch,
-                y_train=infer_model["LDA"][mod2]["TRAIN"],
-                y_dev=infer_model["LDA"][mod2]["DEV"],
-                y_test=infer_model["LDA"][mod2]["TEST"])
-
-for layer in res :
-    mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"],
-                                  layer[1],infer_model["LABEL"][mod]["DEV"],
-                                  layer[2],infer_model["LABEL"][mod]["TEST"],
-                                  mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs,
-                                  output_activation=mlp_output_activation,
-                                  input_activation=input_activation,
-                                  batch_size=mlp_batch_size,fit_verbose=0))
-
-db["VAE"]["SPE"] = mlp_res_list
+if "ASR" in keys and "TRS" in keys :
+    mod = "ASR"
+    mod2= "TRS"
+    mlp_res_list=[]
+
+    res = train_vae(infer_model[features_key][mod]["TRAIN"],
+                    infer_model[features_key][mod]["DEV"],
+                    infer_model[features_key][mod]["TEST"],
+                    hidden_size=hidden_size[0],
+                    sgd=sgd,input_activation=input_activation,output_activation=output_activation,
+                    latent_dim=latent_dim,
+                    nb_epochs=epochs,
+                    batch_size=batch,
+                    y_train=infer_model[features_key][mod2]["TRAIN"],
+                    y_dev=infer_model[features_key][mod2]["DEV"],
+                    y_test=infer_model[features_key][mod2]["TEST"])
+
+    for nb,layer in enumerate(res) :
+        if save_projection:
+            pd = pandas.DataFrame(layer[0])
+            col_count = (pd.sum(axis=0) != 0)
+            pd = pd.loc[:,col_count]
+            pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,"SPE"),"TRAIN")
+            pd = pandas.DataFrame(layer[1])
+            pd = pd.loc[:,col_count]
+            pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,"SPE"),"DEV")
+            pd = pandas.DataFrame(layer[2])
+            pd = pd.loc[:,col_count]
+            pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,"SPE"),"TEST")
+            del pd
+
+        mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"],
+                                      layer[1],infer_model["LABEL"][mod]["DEV"],
+                                      layer[2],infer_model["LABEL"][mod]["TEST"],
+                                      mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs,
+                                      output_activation=mlp_output_activation,
+                                      input_activation=input_activation,
+                                      batch_size=mlp_batch_size,fit_verbose=0))
+
+    db["VAE"]["SPE"] = mlp_res_list
 
 db.sync()
 db.close()
diff --git a/LDA/05-lts_scoring.py b/LDA/05-lts_scoring.py
new file mode 100644
index 0000000..9585d75
--- /dev/null
+++ b/LDA/05-lts_scoring.py
@@ -0,0 +1,80 @@
+import sys
+import shelve
+import pickle
+from utils import *
+import sys
+import os
+import json
+import glob
+import tempfile
+import pandas
+import subprocess
+from subprocess import CalledProcessError
+import shutil
+import numpy
+
+in_dir = sys.argv[1]
+json_conf =json.load(open(sys.argv[2]))
+name = json_conf["name"]
+
+ae_m = shelve.open("{}/{}/ae_model.shelve".format(in_dir,name))
+y_train=numpy.argmax(ae_m["LABEL"]["ASR"]["TRAIN"],axis=1)
+_,ytr_path=tempfile.mkstemp()
+ytr_open= open(ytr_path,"w")
+for el in y_train:
+    print >>ytr_open, el
+ytr_open.close()
+
+y_dev=numpy.argmax(ae_m["LABEL"]["ASR"]["DEV"],axis=1)
+_,yd_path=tempfile.mkstemp()
+yd_open = open(yd_path,"w")
+for el in y_dev:
+    print >>yd_open, el
+yd_open.close()
+
+y_test=numpy.argmax(ae_m["LABEL"]["ASR"]["TEST"],axis=1)
+_,yte_path=tempfile.mkstemp()
+yte_open=open(yte_path,"w")
+for el in y_test:
+    print >>yte_open, el
+yte_open.close()
+
+hdfs_files=glob.glob("{}/{}/*.hdf".format(in_dir,name))
+temp_dir=tempfile.mkdtemp()
+out_file=open("{}/{}/malaha_res.txt".format(in_dir,name),"a")
+
+for hdf in hdfs_files:
+    print >>out_file, "Start ---------------------------------------------------"
+    print >>out_file, hdf
+    x_train = pandas.read_hdf(hdf,"TRAIN")
+    x_train.to_csv("{}/xtrain.dat".format(temp_dir),sep=" ",header=False,index=False, index_label=False)
+    x_train = pandas.read_hdf(hdf,"DEV")
+    x_train.to_csv("{}/xdev.dat".format(temp_dir),sep=" ",header=False,index=False, index_label=False)
+    x_train = pandas.read_hdf(hdf,"TEST")
+    x_train.to_csv("{}/xtest.dat".format(temp_dir),sep=" ",header=False,index=False, index_label=False)
+    try :
+        resdev=subprocess.check_output(['Rscript',
+                                        '/home/laboinfo/janod/WorkingDir/erreur_traduction/Author_Topic_Decoda/estimate.R',
+                                        "{}/xtrain.dat".format(temp_dir),
+                                        "{}/xdev.dat".format(temp_dir),
+                                        ytr_path,yd_path])
+
+        restest=subprocess.check_output(['Rscript',
+                                         '/home/laboinfo/janod/WorkingDir/erreur_traduction/Author_Topic_Decoda/estimate.R',
+                                         "{}/xtrain.dat".format(temp_dir),
+                                         "{}/xtest.dat".format(temp_dir),
+                                         ytr_path,yte_path])
+
+        print >>out_file, resdev
+        print >>out_file, hdf
+        print >>out_file, restest
+    except CalledProcessError:
+        print >>out_file, "FAILED"
+        print >>out_file, hdf
+    print >>out_file, "End ---------------------------------------------------"
+
+shutil.rmtree(temp_dir)
+os.remove(ytr_path)
+os.remove(yd_path)
+os.remove(yte_path)
diff --git a/LDA/mlp.py b/LDA/mlp.py
index 7e8e2cb..c83db76 100755
--- a/LDA/mlp.py
+++ b/LDA/mlp.py
@@ -82,7 +82,6 @@ def ft_dsae(train,dev,test,
 
 def train_mlp(x_train,y_train,x_dev,y_dev,x_test,y_test,hidden_size,input_activation="relu",hidden_activation="relu",output_activation="softmax",loss="mse",init="glorot_uniform",dropouts=None,sgd=None,epochs=1200,batch_size=16,fit_verbose=1,test_verbose=0,save_pred=False,keep_histo=False):
-
     layers = [Input(shape=(x_train.shape[1],))]
 
     for h in hidden_size:
diff --git a/LDA/utils.py b/LDA/utils.py
index c901e37..627c87e 100644
--- a/LDA/utils.py
+++ b/LDA/utils.py
@@ -1,6 +1,10 @@
 # -*- coding: utf-8 -*-
 import nltk
 import re
+import codecs
+import numpy as np
+import sqlite3
+
 pattern = ur"\d+(?:\.\d+)?\s*%?|\w{1,2}'||[\wéàèùêôûâòìîç]+|[^\w\s]"
 rer_b = re.compile(ur" r e r(?: e r)? b ")
 rer_c = re.compile(ur" r e r(?: e r)? c |r e r( e r)? c' est | rer c' est")
@@ -43,3 +47,58 @@ def select(elm):
 
 def select_mmf(elm):
     return int(elm.split("_")[0])
+
+def get_score(table):
+    mx_train = np.max(table[0])
+    argmx_dev = np.argmax(table[1])
+    mx_dev = table[1][argmx_dev]
+    best_test = table[2][argmx_dev]
+    mx_test = np.max(table[2])
+    print """\tmax train : {}
+    \tmax dev : {}
+    \tmax test : {} - best test : {}
+    \t best epochs : {}""".format(mx_train,mx_dev,mx_test,best_test,argmx_dev)
+    return mx_train,mx_dev,mx_test,best_test,argmx_dev
+
+class WeightedWordsList :
+    @staticmethod
+    def get_key(wtuple):
+        return wtuple[1]
+    @staticmethod
+    def get_okey(wtuple):
+        return wtuple[1][1]
+
+
+    def __init__(self,file_path):
+        self.wlist = codecs.open(file_path,"r","utf8").readlines()
+        self.wlist = [x.strip().split(':') for x in self.wlist ]
+        self.wlist = [ (x, float(y)) for x,y in self.wlist ]
+        self.wdict = {}
+        for x,y in self.wlist:
+            self.wdict[x.encode("utf8")] = y
+
+    def select_best(self,word_list,length=5):
+        scored_word = []
+        for w in word_list:
+            w = w.encode("utf8")
+            if w not in self.wdict :
+                continue
+
+            if len(scored_word) < length:
+                scored_word.append((w,self.wdict[w]))
+            else :
+                w_min= min(enumerate(scored_word),key=WeightedWordsList.get_okey)
+                w_curr = (w, self.wdict[w])
+                if w_min[1][1] < w_curr[1]:
+                    del scored_word[w_min[0]]
+                    scored_word.append(w_curr)
+                    w_min = min(enumerate(scored_word),key=WeightedWordsList.get_okey)
+                    while len(scored_word) > length and w_min[1][1] < w_curr[1] :
+                        del scored_word[w_min[0]]
+                        w_min = min(enumerate(scored_word),key=WeightedWordsList.get_okey)
+                elif w_min[1][1] == w_curr[1]:
+                    scored_word.append(w_curr)
+        return [ w[0] for w in scored_word ]
+
+
+
+
diff --git a/LDA/vae.py b/LDA/vae.py
index b846e53..4a8f858 100644
--- a/LDA/vae.py
+++ b/LDA/vae.py
@@ -16,14 +16,60 @@ from keras.models import Model
 from keras import backend as K
 from keras import objectives
 from keras.datasets import mnist
+from keras.callbacks import EarlyStopping,Callback
+import warnings
 
 import pandas
 import shelve
 import pickle
 
-
-
+class ZeroStopping(Callback):
+    '''Stop training when a monitored quantity has stopped improving.
+    # Arguments
+        monitor: quantity to be monitored.
+        patience: number of epochs with no improvement
+            after which training will be stopped.
+        verbose: verbosity mode.
+        mode: one of {auto, min, max}. In 'min' mode,
+            training will stop when the quantity
+            monitored has stopped decreasing; in 'max'
+            mode it will stop when the quantity
+            monitored has stopped increasing.
+    '''
+    def __init__(self, monitor='val_loss', verbose=0, mode='auto', thresh = 0):
+        super(ZeroStopping, self).__init__()
+
+        self.monitor = monitor
+        self.verbose = verbose
+        self.thresh = thresh  # stop as soon as the monitored value crosses this threshold
+
+        if mode not in ['auto', 'min', 'max']:
+            warnings.warn('EarlyStopping mode %s is unknown, '
+                          'fallback to auto mode.' % (mode),
+                          RuntimeWarning)
+            mode = 'auto'
+
+        if mode == 'min':
+            self.monitor_op = np.less
+        elif mode == 'max':
+            self.monitor_op = np.greater
+        else:
+            if 'acc' in self.monitor:
+                self.monitor_op = np.greater
+            else:
+                self.monitor_op = np.less
+
+    def on_epoch_end(self, epoch, logs={}):
+        current = logs.get(self.monitor)
+        if current is None:
+            warnings.warn('Zero stopping requires %s available!' %
+                          (self.monitor), RuntimeWarning)
+            return
+
+        if self.monitor_op(current, self.thresh):
+            self.best = current
+            self.model.stop_training = True
 
 #batch_size = 16
 #original_dim = 784
@@ -82,8 +128,11 @@ def train_vae(x_train,x_dev,x_test,y_train=None,y_dev=None,y_test=None,hidden_si
     vae.fit(x_train, y_train,
             shuffle=True,
             nb_epoch=nb_epochs,
+            verbose = 1,
             batch_size=batch_size,
-            validation_data=(x_dev, y_dev))
+            validation_data=(x_dev, y_dev),
+            callbacks = [ZeroStopping(monitor='val_loss', thresh=0, verbose=0, mode='min')]
+            )
 
     # build a model to project inputs on the latent space
     encoder = Model(x, z_mean)
-- 
1.8.2.3
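
Notes on the patch:

Every 04* script now takes an optional trailing command-line argument naming which feature table of the shelve to read (sys.argv[3] for 04a-mmdf.py, sys.argv[4] for the others), falling back to "LDA" when it is absent, so the old invocations keep working. A hedged invocation sketch; the shelve, config, and "W2V" key names are placeholders, not files from this repo:

    python 04b-mmf_mini_ae.py out_dir corpus.shelve conf.json W2V
    python 04b-mmf_mini_ae.py out_dir corpus.shelve conf.json        # reads "LDA" as before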
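
The save_projection blocks added to 04b, 04c and 04e all repeat one pattern: build a column mask from the TRAIN projection (dropping dimensions that are zero on every training example), apply that same mask to DEV and TEST, and store the three splits under one HDF5 file, which 05-lts_scoring.py later picks up by globbing *.hdf. A minimal sketch of that pattern; the helper name save_projection_hdf is illustrative, and pandas.to_hdf needs PyTables installed:

    import pandas

    def save_projection_hdf(layer, hdf_path):
        # layer is a (train, dev, test) tuple of 2D arrays
        train = pandas.DataFrame(layer[0])
        col_mask = (train.sum(axis=0) != 0)  # keep only columns active on TRAIN
        train.loc[:, col_mask].to_hdf(hdf_path, "TRAIN")
        pandas.DataFrame(layer[1]).loc[:, col_mask].to_hdf(hdf_path, "DEV")
        pandas.DataFrame(layer[2]).loc[:, col_mask].to_hdf(hdf_path, "TEST")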
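
utils.get_score summarizes a (train, dev, test) table of per-epoch scores: it takes the epoch where DEV peaks and reports the TEST score at that same epoch ("best test") next to the overall TEST maximum, which guards against cherry-picking the test curve. A tiny worked example with made-up accuracies:

    import numpy as np
    from utils import get_score

    table = [np.array([0.90, 0.95, 0.99]),   # train accuracy per epoch
             np.array([0.70, 0.80, 0.75]),   # dev
             np.array([0.65, 0.72, 0.74])]   # test
    # DEV peaks at epoch 1, so get_score(table) returns
    # (0.99, 0.80, 0.74, 0.72, 1): "best test" is 0.72, not the 0.74 maximum.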
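
05-lts_scoring.py shells out to R once per split: each projection is dumped to a whitespace-separated .dat file, handed to Rscript together with the label files, and a CalledProcessError is logged as FAILED instead of aborting the loop. A condensed sketch of that call pattern (Python 2, like the rest of the repo); the rscript path is the hard-coded estimate.R in the script, and run_estimate is an illustrative name:

    import subprocess
    from subprocess import CalledProcessError

    def run_estimate(rscript, xtrain, xeval, ytrain, yeval, log):
        # writes the R output (or FAILED) to the already-open log file
        try:
            res = subprocess.check_output(['Rscript', rscript,
                                           xtrain, xeval, ytrain, yeval])
            print >>log, res
        except CalledProcessError:
            print >>log, "FAILED"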