Commit 7db73861ffbab3f3f51b17188d8894a512b36264
1 parent: b6d0165d16
Exists in: master
add vae and mmf
Showing 13 changed files with 1084 additions and 44 deletions
LDA/00-mmf_make_features.py
| 1 | +import sys | |
| 2 | +import os | |
| 3 | + | |
| 4 | +import pandas | |
| 5 | +import numpy | |
| 6 | +import shelve | |
| 7 | + | |
| 8 | +from sklearn.preprocessing import LabelBinarizer | |
| 9 | + | |
| 10 | +from utils import select_mmf as select | |
| 11 | + | |
| 12 | +input_dir = sys.argv[1] # top-level directory containing ASR and TRS | |
| 13 | +level = sys.argv[2] # desired LDA size (-5) | |
| 14 | + | |
| 15 | +lb=LabelBinarizer() | |
| 16 | +#y_train=lb.fit_transform([utils.select(ligneid) for ligneid in origin_corps["LABEL"]["TRAIN"]]) | |
| 17 | + | |
| 18 | + | |
| 19 | +# writeback=True so in-place updates to the nested dicts persist on sync() | |
| 20 | +data = shelve.open("{}/mmf_{}.shelve".format(input_dir,level), writeback=True) | |
| 21 | +data["LABEL"] = {} | |
| 22 | +data["LDA"] = {} | |
| 23 | +for mod in ["ASR", "TRS"] : | |
| 22 | + train = pandas.read_table("{}/{}/train_{}.ssv".format(input_dir, mod, level), sep=" ", header=None ) | |
| 23 | + dev = pandas.read_table("{}/{}/dev_{}.ssv".format(input_dir, mod, level), sep=" ", header=None ) | |
| 24 | + test = pandas.read_table("{}/{}/test_{}.ssv".format(input_dir, mod, level), sep=" ", header=None ) | |
| 25 | + | |
| 26 | + y_train = train.iloc[:,0].apply(select) | |
| 27 | + y_dev = dev.iloc[:,0].apply(select) | |
| 28 | + y_test = test.iloc[:,0].apply(select) | |
| 29 | + lb.fit(y_train) | |
| 30 | + data["LABEL"][mod]={"TRAIN":lb.transform(y_train),"DEV":lb.transform(y_dev), "TEST": lb.transform(y_test)} | |
| 31 | + | |
| 32 | + data["LDA"][mod]={} | |
| 33 | + data["LDA"][mod]["TRAIN"]=train.iloc[:,1:].values | |
| 34 | + data["LDA"][mod]["DEV"]=dev.iloc[:,1:].values | |
| 35 | + data["LDA"][mod]["TEST"]=test.iloc[:,1:].values | |
| 36 | + | |
| 37 | +data.sync() | |
| 38 | +data.close() |
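For reference, a minimal reader sketch (not part of this commit; the path and key layout follow the script above) showing how a downstream step can load the features back:

```python
# Hypothetical reader for the shelve written by 00-mmf_make_features.py.
# Layout (from the script above): data["LDA"][mod] and data["LABEL"][mod]
# each hold "TRAIN"/"DEV"/"TEST" arrays, for mod in ("ASR", "TRS").
import shelve

data = shelve.open("input_dir/mmf_5.shelve")  # example path
for mod in ["ASR", "TRS"]:
    X_train = data["LDA"][mod]["TRAIN"]    # LDA topic features
    y_train = data["LABEL"][mod]["TRAIN"]  # binarized labels
    print(mod, X_train.shape, y_train.shape)
data.close()
```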
LDA/02-lda.py
| ... | ... | @@ -12,10 +12,11 @@ |
| 12 | 12 | import dill |
| 13 | 13 | from tinydb import TinyDB, where, Query |
| 14 | 14 | import time |
| 15 | +from joblib import Parallel, delayed | |
| 15 | 16 | |
| 16 | 17 | def calc_perp(models,train): |
| 17 | 18 | |
| 18 | - | |
| 19 | + | |
| 19 | 20 | stop_words=models[1] |
| 20 | 21 | name = models[0] |
| 21 | 22 | |
| ... | ... | @@ -45,7 +46,8 @@ |
| 45 | 46 | def train_lda(out_dir,train,size,it,sw_size,alpha,eta,passes,chunk): |
| 46 | 47 | name = "s{}_it{}_sw{}_a{}_e{}_p{}_c{}".format(size,it,sw_size,alpha,eta,passes,chunk) |
| 47 | 48 | logging.warning(name) |
| 48 | - if os.path.isfile(out_dir+"/"+name+".dill"): | |
| 49 | + deep_out_dir = out_dir+"/"+name | |
| 50 | + if os.path.isdir(deep_out_dir): | |
| 49 | 51 | logging.error(name+" already done") |
| 50 | 52 | return |
| 51 | 53 | logging.warning(name+" to be done") |
| ... | ... | @@ -54,7 +56,6 @@ |
| 54 | 56 | asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ] |
| 55 | 57 | trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ] |
| 56 | 58 | stop_words=set(asr_sw) | set(trs_sw) |
| 57 | - stop_words=[ x.strip() for x in open("french.txt").readlines() ] | |
| 58 | 59 | |
| 59 | 60 | logging.warning("TRS to be done") |
| 60 | 61 | |
| 61 | 62 | |
| 62 | 63 | |
| 63 | 64 | |
| ... | ... | @@ -68,19 +69,42 @@ |
| 68 | 69 | asr_probs = [] |
| 69 | 70 | for line in lda_asr.expElogbeta: |
| 70 | 71 | nline = line / np.sum(line) |
| 71 | - asr_probs.append( str(x) for x in nline) | |
| 72 | + asr_probs.append([ str(x) for x in nline]) | |
| 72 | 73 | trs_probs = [] |
| 73 | 74 | for line in lda_trs.expElogbeta: |
| 74 | 75 | nline = line / np.sum(line) |
| 75 | - trs_probs.append( str(x) for x in nline) | |
| 76 | + trs_probs.append([str(x) for x in nline]) | |
| 76 | 77 | |
| 77 | 78 | K = lda_asr.num_topics |
| 78 | 79 | topicWordProbMat_asr = lda_asr.print_topics(K,10) |
| 79 | 80 | |
| 80 | 81 | K = lda_trs.num_topics |
| 81 | 82 | topicWordProbMat_trs = lda_trs.print_topics(K,10) |
| 83 | + os.mkdir(deep_out_dir) | |
| 84 | + dill.dump([x for x in stop_words],open(deep_out_dir+"/stopwords.dill","w")) | |
| 85 | + lda_asr.save(deep_out_dir+"/lda_asr.model") | |
| 86 | + lda_trs.save(deep_out_dir+"/lda_trs.model") | |
| 87 | + dill.dump([x for x in asr_probs],open(deep_out_dir+"/lda_asr_probs.dill","w")) | |
| 88 | + dill.dump([x for x in trs_probs],open(deep_out_dir+"/lda_trs_probs.dill","w")) | |
| 89 | + | |
| 82 | 90 | return [name, stop_words, lda_asr , asr_probs , topicWordProbMat_asr, lda_trs, trs_probs, topicWordProbMat_trs] |
| 83 | 91 | |
| 92 | +def train_one(name,train,s,i,sw,a,e,p,c): | |
| 93 | + st=time.time() | |
| 94 | + logging.warning(" ; ".join([str(x) for x in [s,i,sw,a,e,p,c]])) | |
| 95 | + models = train_lda(name,train,s,i,sw,a,e,p,c) | |
| 96 | + if models: | |
| 97 | + m = calc_perp(models,train) | |
| 98 | + #dill.dump(models,open("{}/{}.dill".format(name,models[0]),"wb")) | |
| 99 | + else : | |
| 100 | + m = None | |
| 101 | + end = time.time() # do not reuse "e": it still names the eta parameter | |
| 102 | + logging.warning("done in : {}".format(end-st)) | |
| 103 | + return m | |
| 104 | + | |
| 105 | + | |
| 106 | + | |
| 107 | + | |
| 84 | 108 | if __name__ == "__main__": |
| 85 | 109 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING) |
| 86 | 110 | |
| ... | ... | @@ -109,6 +133,8 @@ |
| 109 | 133 | db = TinyDB(db_path) |
| 110 | 134 | nb_model = len(passes) * len(chunk) * len(it) * len(sw_size) * len(alpha) * len(eta) * len(size) |
| 111 | 135 | logging.warning(" hey will train {} models ".format(nb_model)) |
| 136 | + | |
| 137 | + args_list=[] | |
| 112 | 138 | for p in passes: |
| 113 | 139 | for c in chunk: |
| 114 | 140 | for i in it : |
| ... | ... | @@ -116,13 +142,9 @@ |
| 116 | 142 | for a in alpha: |
| 117 | 143 | for e in eta: |
| 118 | 144 | for s in size: |
| 119 | - st=time.time() | |
| 120 | - logging.warning(" ; ".join([str(x) for x in [s,i,sw,a,e,p,c]])) | |
| 121 | - models = train_lda(name,train,s,i,sw,a,e,p,c) | |
| 122 | - if models: | |
| 123 | - m = calc_perp(models,train) | |
| 124 | - dill.dump(models,open("{}/{}.dill".format(name,models[0]),"wb")) | |
| 125 | - db.insert(m) | |
| 126 | - e = time.time() | |
| 127 | - logging.warning("fin en : {}".format(e-st)) | |
| 145 | + args_list.append((name,train,s,i,sw,a,e,p,c)) | |
| 146 | + res_list = Parallel(n_jobs=15)(delayed(train_one)(*args) for args in args_list) | |
| 147 | + for m in res_list : | |
| 148 | + if m is not None : # train_one returns None when the model was already done | |
| 149 | + db.insert(m) | |
| 149 | + |
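The change above replaces the nested loop's body with joblib's fan-out idiom: gather every argument tuple first, then dispatch them to worker processes. A self-contained sketch of the same pattern (toy function standing in for train_one):

```python
from joblib import Parallel, delayed

def work(size, iterations):
    # stand-in for train_one: any picklable function works here
    return size * iterations

args_list = [(s, i) for s in (10, 20) for i in (1, 2, 3)]
# dispatch to 2 workers; results come back in args_list order
results = Parallel(n_jobs=2)(delayed(work)(*args) for args in args_list)
print(results)  # [10, 20, 30, 20, 40, 60]
```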
LDA/03-mono_perplex.py
| ... | ... | @@ -52,7 +52,7 @@ |
| 52 | 52 | input_dir = sys.argv[2] |
| 53 | 53 | db_path = sys.argv[3] |
| 54 | 54 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING) |
| 55 | - folders = glob.glob("{}/*".format(input_dir)) | |
| 55 | + folders = glob.glob("{}/s*".format(input_dir)) | |
| 56 | 56 | |
| 57 | 57 | #train=pickle.load(open("{}/newsgroup_bow_train.pk".format(input_dir))) |
| 58 | 58 | train = shelve.open(input_shelve) |
LDA/03-perplex.py
| ... | ... | @@ -22,40 +22,43 @@ |
| 22 | 22 | |
| 23 | 23 | |
| 24 | 24 | def calc_perp(params): |
| 25 | - in_dir,train = params | |
| 26 | - name = in_dir.split("/")[-1] | |
| 27 | - # s40_it1_sw50_a0.01_e0.1_p6_c1000 | |
| 25 | + in_dir,train = params | |
| 26 | + name = in_dir.split("/")[-1] | |
| 27 | + try: | |
| 28 | + # s40_it1_sw50_a0.01_e0.1_p6_c1000 | |
| 28 | 29 | |
| 29 | - entry = Query() | |
| 30 | - value=db.search(entry.name == name) | |
| 31 | - if len(value) > 0 : | |
| 32 | - logging.warning("{} already done".format(name)) | |
| 33 | - return | |
| 30 | + entry = Query() | |
| 31 | + value=db.search(entry.name == name) | |
| 32 | + if len(value) > 0 : | |
| 33 | + logging.warning("{} already done".format(name)) | |
| 34 | + return | |
| 34 | 35 | |
| 35 | - sw_size = int(name.split("_")[2][2:]) | |
| 36 | + sw_size = int(name.split("_")[2][2:]) | |
| 36 | 37 | |
| 37 | - logging.warning(" go {} ".format(name)) | |
| 38 | + logging.warning(" go {} ".format(name)) | |
| 38 | 39 | |
| 39 | 40 | |
| 40 | - logging.warning("Redo Vocab and stop") | |
| 41 | - asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y]) | |
| 42 | - trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y]) | |
| 43 | - asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ] | |
| 44 | - trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ] | |
| 45 | - stop_words=set(asr_sw) | set(trs_sw) | |
| 41 | + logging.warning("Redo Vocab and stop") | |
| 42 | + asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y]) | |
| 43 | + trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y]) | |
| 44 | + asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ] | |
| 45 | + trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ] | |
| 46 | + stop_words=set(asr_sw) | set(trs_sw) | |
| 46 | 47 | |
| 47 | - logging.warning("TRS to be done") | |
| 48 | - | |
| 49 | - dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]] | |
| 50 | - lda_trs = LdaModel.load("{}/lda_trs.model".format(in_dir)) | |
| 51 | - perp_trs = lda_trs.log_perplexity(dev_trs) | |
| 52 | - logging.warning("ASR to be done") | |
| 53 | - dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]] | |
| 54 | - lda_asr = LdaModel.load("{}/lda_asr.model".format(in_dir)) | |
| 55 | - perp_asr = lda_asr.log_perplexity(dev_asr) | |
| 56 | - logging.warning("ASR saving") | |
| 57 | - res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs} | |
| 58 | - return res_dict | |
| 48 | + logging.warning("TRS to be done") | |
| 49 | + | |
| 50 | + dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]] | |
| 51 | + lda_trs = LdaModel.load("{}/lda_trs.model".format(in_dir)) | |
| 52 | + perp_trs = lda_trs.log_perplexity(dev_trs) | |
| 53 | + logging.warning("ASR to be done") | |
| 54 | + dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]] | |
| 55 | + lda_asr = LdaModel.load("{}/lda_asr.model".format(in_dir)) | |
| 56 | + perp_asr = lda_asr.log_perplexity(dev_asr) | |
| 57 | + logging.warning("ASR saving") | |
| 58 | + res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs} | |
| 59 | + return res_dict | |
| 60 | + except Exception : | |
| 61 | + return { "name" : name } | |
| 59 | 62 | |
| 60 | 63 | if __name__ == "__main__": |
| 61 | 64 | input_shelve = sys.argv[1] |
LDA/04a-mmdf.py
| 1 | + | |
| 2 | +# coding: utf-8 | |
| 3 | + | |
| 4 | +# In[29]: | |
| 5 | + | |
| 6 | +# Import | |
| 7 | +import itertools | |
| 8 | +import shelve | |
| 9 | +import pickle | |
| 10 | +import numpy | |
| 11 | +import scipy | |
| 12 | +from scipy import sparse | |
| 13 | +import scipy.sparse | |
| 14 | +import scipy.io | |
| 15 | +from mlp import * | |
| 16 | +import mlp | |
| 17 | +from keras.optimizers import Adam # explicit import for the optimizer used below | |
| 17 | +import sys | |
| 18 | +import utils | |
| 19 | +import dill | |
| 20 | +from collections import Counter | |
| 21 | +from gensim.models import LdaModel | |
| 22 | + | |
| 23 | + | |
| 24 | + | |
| 25 | +# In[3]: | |
| 26 | + | |
| 27 | +#30_50_50_150_0.0001 | |
| 28 | + | |
| 29 | +# In[4]: | |
| 30 | + | |
| 31 | +#db=shelve.open("SPELIKE_MLP_DB.shelve",writeback=True) | |
| 32 | +origin_corps=shelve.open("{}".format(sys.argv[2])) | |
| 33 | +in_dir = sys.argv[1] | |
| 34 | + | |
| 35 | + | |
| 36 | +out_db=shelve.open("{}/mlp_scores.shelve".format(in_dir),writeback=True) | |
| 37 | + | |
| 38 | +mlp_h = [ 250, 250 ] | |
| 39 | +mlp_loss = "categorical_crossentropy" | |
| 40 | +mlp_dropouts = [0.25]* len(mlp_h) | |
| 41 | +mlp_sgd = Adam(lr=0.0001) | |
| 42 | +mlp_epochs = 3000 | |
| 43 | +mlp_batch_size = 1 | |
| 44 | +mlp_input_activation = "relu" | |
| 45 | +mlp_output_activation="softmax" | |
| 46 | + | |
| 47 | +ress = [] | |
| 48 | +for key in ["TRS", "ASR"] : | |
| 49 | + | |
| 50 | + res=mlp.train_mlp(origin_corps["LDA"][key]["TRAIN"],origin_corps["LABEL"][key]["TRAIN"], | |
| 51 | + origin_corps["LDA"][key]["DEV"],origin_corps["LABEL"][key]["DEV"], | |
| 52 | + origin_corps["LDA"][key]["TEST"],origin_corps["LABEL"][key]["TEST"], | |
| 53 | + mlp_h,dropouts=mlp_dropouts,sgd=mlp_sgd, | |
| 54 | + epochs=mlp_epochs, | |
| 55 | + batch_size=mlp_batch_size, | |
| 56 | + save_pred=False,keep_histo=False, | |
| 57 | + loss="categorical_crossentropy",fit_verbose=0) | |
| 58 | + arg_best=[] | |
| 59 | + dev_best=[] | |
| 60 | + # keep the 12 best dev epochs: take the argmax, record it, then zero it out | |
| 61 | + for _ in range(12) : | |
| 62 | + arg_best.append(numpy.argmax(res[1])) | |
| 63 | + dev_best.append(res[1][arg_best[-1]]) | |
| 64 | + res[1][arg_best[-1]]=0 | |
| 96 | + | |
| 97 | + | |
| 98 | + | |
| 99 | + | |
| 100 | + test_best =[ res[2][x] for x in arg_best ] | |
| 101 | + test_max = numpy.max(res[2]) | |
| 102 | + out_db[key]=(res,(dev_best,test_best,test_max)) | |
| 103 | + ress.append((key,dev_best,test_best,test_max)) | |
| 104 | + | |
| 105 | +for el in ress : | |
| 106 | + print el | |
| 107 | +out_db.close() | |
| 108 | +origin_corps.close() |
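The argmax-and-zero bookkeeping above collects the 12 best dev epochs by destructively zeroing each maximum. A non-destructive equivalent with numpy.argsort (same output for distinct scores; ties may order differently), shown here as an illustrative helper:

```python
import numpy

def top_k(scores, k=12):
    # indices of the k largest scores, best first, without mutating the input
    scores = numpy.asarray(scores, dtype=float)
    order = numpy.argsort(scores)[::-1][:k]
    return list(order), scores[order].tolist()

arg_best, dev_best = top_k([0.1, 0.7, 0.3, 0.9], k=2)
print(arg_best, dev_best)  # [3, 1] [0.9, 0.7]
```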
LDA/04b-mmf_mini_ae.py
| 1 | + | |
| 2 | +# coding: utf-8 | |
| 3 | + | |
| 4 | +# In[2]: | |
| 5 | + | |
| 6 | +# Import | |
| 7 | +import gensim | |
| 8 | +from scipy import sparse | |
| 9 | +import itertools | |
| 10 | +from sklearn import preprocessing | |
| 11 | +from keras.models import Sequential | |
| 12 | +from keras.optimizers import SGD,Adam | |
| 13 | +from mlp import * | |
| 14 | +import sklearn.metrics | |
| 15 | +import shelve | |
| 16 | +import pickle | |
| 17 | +from utils import * | |
| 18 | +import sys | |
| 19 | +import os | |
| 20 | +import json | |
| 21 | +# In[4]: | |
| 22 | + | |
| 23 | +infer_model=shelve.open("{}".format(sys.argv[2])) | |
| 24 | +in_dir = sys.argv[1] | |
| 25 | +#['ASR', 'TRS', 'LABEL'] | |
| 26 | +# In[6]: | |
| 27 | + | |
| 28 | + | |
| 29 | +hidden_size=[ 100 , 50, 100 ] | |
| 30 | +input_activation="tanh" | |
| 31 | +output_activation="tanh" | |
| 32 | +loss="mse" | |
| 33 | +epochs=1000 | |
| 34 | +batch=1 | |
| 35 | +patience=60 | |
| 36 | +do_do=[False] | |
| 37 | +sgd = Adam(lr=0.000001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) | |
| 38 | + | |
| 39 | + | |
| 40 | + | |
| 41 | +mlp_h = [ 150 ,150 ,150 ] | |
| 42 | +mlp_loss = "categorical_crossentropy" | |
| 43 | +mlp_dropouts = [] | |
| 44 | +mlp_sgd = Adam(lr=0.0001) | |
| 45 | +mlp_epochs = 2000 | |
| 46 | +mlp_batch_size = 8 | |
| 47 | +mlp_output_activation="softmax" | |
| 48 | + | |
| 49 | +try : | |
| 50 | + sgd_repr=sgd.get_config()["name"] | |
| 51 | +except AttributeError : | |
| 52 | + sgd_repr=sgd | |
| 53 | + | |
| 54 | +try : | |
| 55 | + mlp_sgd_repr=mlp_sgd.get_config()["name"] | |
| 56 | +except AttributeError : | |
| 57 | + mlp_sgd_repr=mlp_sgd | |
| 58 | + | |
| 59 | + | |
| 60 | +params={ "h1" : "_".join([ str(x) for x in hidden_size ]), | |
| 61 | + "inside_activation" : input_activation, | |
| 62 | + "output_activation" : output_activation, | |
| 63 | + "do_dropout": "_".join([str(x) for x in do_do]), | |
| 64 | + "loss" : loss, | |
| 65 | + "epochs" : epochs , | |
| 66 | + "batch_size" : batch, | |
| 67 | + "patience" : patience, | |
| 68 | + "sgd" : sgd_repr, | |
| 69 | + "mlp_h ": "_".join([str(x) for x in mlp_h]), | |
| 70 | + "mlp_loss ": mlp_loss, | |
| 71 | + "mlp_dropouts ": "_".join([str(x) for x in mlp_dropouts]), | |
| 72 | + "mlp_sgd ": mlp_sgd_repr, | |
| 73 | + "mlp_epochs ": mlp_epochs, | |
| 74 | + "mlp_batch_size ": mlp_batch_size, | |
| 75 | + "mlp_output" : mlp_output_activation | |
| 76 | + } | |
| 77 | +name = "_".join([ str(x) for x in params.values()]) | |
| 78 | +try: | |
| 79 | + os.mkdir("{}/{}".format(in_dir,name)) | |
| 80 | +except: | |
| 81 | + pass | |
| 82 | +db = shelve.open("{}/{}/ae_model.shelve".format(in_dir,name),writeback=True) | |
| 83 | +db["params"] = params | |
| 84 | +db["LABEL"]=infer_model["LABEL"] | |
| 85 | +# | |
| 86 | +json.dump(params, | |
| 87 | + open("{}/{}/ae_model.json".format(in_dir,name),"w"), | |
| 88 | + indent=4) | |
| 89 | + | |
| 90 | +keys = ["ASR","TRS"] | |
| 91 | + | |
| 92 | +db["AE"] = {} | |
| 93 | +db["LDA"] = {} | |
| 94 | +for mod in keys : | |
| 95 | + print mod | |
| 96 | + db["LDA"][mod] = train_mlp(infer_model["LDA"][mod]["TRAIN"],infer_model["LABEL"][mod]["TRAIN"], | |
| 97 | + infer_model["LDA"][mod]["DEV"],infer_model["LABEL"][mod]["DEV"], | |
| 98 | + infer_model["LDA"][mod]["TEST"],infer_model["LABEL"][mod]["TEST"], | |
| 99 | + mlp_h ,sgd=mlp_sgd, | |
| 100 | + epochs=mlp_epochs, | |
| 101 | + batch_size=mlp_batch_size, | |
| 102 | + input_activation=input_activation, | |
| 103 | + output_activation=mlp_output_activation, | |
| 104 | + dropouts=mlp_dropouts, | |
| 105 | + fit_verbose=0) | |
| 106 | + | |
| 107 | + res=train_ae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"],infer_model["LDA"][mod]["TEST"], | |
| 108 | + hidden_size,patience = params["patience"],sgd=sgd, | |
| 109 | + dropouts=do_do,input_activation=input_activation,output_activation=output_activation, | |
| 110 | + loss=loss,epochs=epochs,batch_size=batch,verbose=0) | |
| 111 | + mlp_res_list=[] | |
| 112 | + for layer in res : | |
| 113 | + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], | |
| 114 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
| 115 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
| 116 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs, | |
| 117 | + output_activation=mlp_output_activation, | |
| 118 | + input_activation=input_activation, | |
| 119 | + batch_size=mlp_batch_size,fit_verbose=0)) | |
| 120 | + db["AE"][mod]=mlp_res_list | |
| 121 | + | |
| 122 | +mod = "ASR" | |
| 123 | +mod2= "TRS" | |
| 124 | +mlp_res_list=[] | |
| 125 | + | |
| 126 | +res = train_ae(infer_model["LDA"][mod]["TRAIN"], | |
| 127 | + infer_model["LDA"][mod]["DEV"], | |
| 128 | + infer_model["LDA"][mod]["TEST"], | |
| 129 | + hidden_size,dropouts=do_do,patience = params["patience"], | |
| 130 | + sgd=sgd,input_activation=input_activation,output_activation=output_activation,loss=loss,epochs=epochs, | |
| 131 | + batch_size=batch, | |
| 132 | + y_train=infer_model["LDA"][mod2]["TRAIN"], # target modality, matching y_dev/y_test | |
| 133 | + y_dev=infer_model["LDA"][mod2]["DEV"], | |
| 134 | + y_test=infer_model["LDA"][mod2]["TEST"]) | |
| 135 | + | |
| 136 | +for layer in res : | |
| 137 | + mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], | |
| 138 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
| 139 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
| 140 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs, | |
| 141 | + output_activation=mlp_output_activation, | |
| 142 | + input_activation=input_activation, | |
| 143 | + batch_size=mlp_batch_size,fit_verbose=0)) | |
| 144 | + | |
| 145 | +db["AE"]["SPE"] = mlp_res_list | |
| 146 | + | |
| 147 | +db.sync() | |
| 148 | +db.close() |
LDA/04c-mmf_sae.py
| 1 | + | |
| 2 | +# coding: utf-8 | |
| 3 | + | |
| 4 | +# In[2]: | |
| 5 | + | |
| 6 | +# Import | |
| 7 | +import gensim | |
| 8 | +from scipy import sparse | |
| 9 | +import itertools | |
| 10 | +from sklearn import preprocessing | |
| 11 | +from keras.models import Sequential | |
| 12 | +from keras.optimizers import SGD,Adam | |
| 13 | +from mlp import * | |
| 14 | +import mlp | |
| 15 | +import sklearn.metrics | |
| 16 | +import shelve | |
| 17 | +import pickle | |
| 18 | +from utils import * | |
| 19 | +import sys | |
| 20 | +import os | |
| 21 | +import json | |
| 22 | +# In[4]: | |
| 23 | + | |
| 24 | +infer_model=shelve.open("{}".format(sys.argv[2])) | |
| 25 | +in_dir = sys.argv[1] | |
| 26 | +#['ASR', 'TRS', 'LABEL'] | |
| 27 | +# In[6]: | |
| 28 | + | |
| 29 | + | |
| 30 | +hidden_size=[ 100, 80, 50 , 20 ] | |
| 31 | +input_activation="relu" | |
| 32 | +output_activation="relu" | |
| 33 | +loss="mse" | |
| 34 | +epochs=3000 | |
| 35 | +batch=1 | |
| 36 | +patience=20 | |
| 37 | +do_do=[ 0 ] * len(hidden_size) | |
| 38 | +sgd = Adam(lr=0.0001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) | |
| 39 | +try : | |
| 40 | + sgd_repr=sgd.get_config()["name"] | |
| 41 | +except AttributeError : | |
| 42 | + sgd_repr=sgd | |
| 43 | + | |
| 44 | +params={ "h1" : "_".join([str(x) for x in hidden_size]), | |
| 45 | + "inside_activation" : input_activation, | |
| 46 | + "out_activation" : output_activation, | |
| 47 | + "do_dropout": "_".join([str(x) for x in do_do]), | |
| 48 | + "loss" : loss, | |
| 49 | + "epochs" : epochs , | |
| 50 | + "batch_size" : batch, | |
| 51 | + "patience" : patience, | |
| 52 | + "sgd" : sgd_repr} | |
| 53 | +name = "_".join([ str(x) for x in params.values()]) | |
| 54 | +try: | |
| 55 | + os.mkdir("{}/SAE_{}".format(in_dir,name)) | |
| 56 | +except: | |
| 57 | + pass | |
| 58 | +db = shelve.open("{}/SAE_{}/ae_model.shelve".format(in_dir,name),writeback=True) | |
| 59 | +# | |
| 60 | +json.dump(params, | |
| 61 | + open("{}/SAE_{}/ae_model.json".format(in_dir,name),"w"), | |
| 62 | + indent=4) | |
| 63 | + | |
| 64 | +keys = ["ASR","TRS"] | |
| 65 | + | |
| 66 | +mlp_h = [ 150 , 300 ] | |
| 67 | +mlp_loss ="categorical_crossentropy" | |
| 68 | +mlp_dropouts = [0,0,0,0] | |
| 69 | +mlp_sgd = Adam(0.001) | |
| 70 | +mlp_epochs = 2000 | |
| 71 | +mlp_batch_size = 8 | |
| 72 | + | |
| 73 | +db["SAE"] = {} | |
| 74 | + | |
| 75 | +db["SAEFT"] = {} | |
| 76 | +for mod in keys : | |
| 77 | + print "MODE ", mod | |
| 78 | + res_tuple=train_sae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"], | |
| 79 | + infer_model["LDA"][mod]["TEST"], | |
| 80 | + hidden_size,dropouts=do_do, | |
| 81 | + patience = params["patience"],sgd=sgd,input_activation="tanh", | |
| 82 | + output_activation="tanh",loss=loss,epochs=epochs, | |
| 83 | + batch_size=batch,verbose=0) | |
| 84 | + #print len(res), [len(x) for x in res[0]], [ len(x) for x in res[1]] | |
| 85 | + for name , levels in zip(["SAE","SAEFT"],res_tuple): | |
| 86 | + print "NAME", name | |
| 87 | + mlp_res_by_level = [] | |
| 88 | + for res in levels: | |
| 89 | + mlp_res_list=[] | |
| 90 | + for nb,layer in enumerate(res) : | |
| 91 | + print "layer NB",nb | |
| 92 | + mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], | |
| 93 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
| 94 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
| 95 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, | |
| 96 | + sgd=mlp_sgd,epochs=mlp_epochs,batch_size=mlp_batch_size, | |
| 97 | + fit_verbose=0)) | |
| 98 | + mlp_res_by_level.append(mlp_res_list) | |
| 99 | + db[name][mod]=mlp_res_by_level | |
| 100 | + | |
| 101 | +mod = "ASR" | |
| 102 | +mod2= "TRS" | |
| 103 | +print "mode SPE " | |
| 104 | +res_tuple = train_sae(infer_model["LDA"][mod]["TRAIN"], | |
| 105 | + infer_model["LDA"][mod]["DEV"], | |
| 106 | + infer_model["LDA"][mod]["TEST"], | |
| 107 | + hidden_size,dropouts=[0],patience=params["patience"], | |
| 108 | + sgd=sgd,input_activation=input_activation,output_activation=input_activation, | |
| 109 | + loss=loss,epochs=epochs,batch_size=batch, | |
| 110 | + y_train=infer_model["LDA"][mod2]["TRAIN"], | |
| 111 | + y_dev=infer_model["LDA"][mod2]["DEV"], | |
| 112 | + y_test=infer_model["LDA"][mod2]["TEST"]) | |
| 113 | + | |
| 114 | +for name , levels in zip(["SAE","SAEFT"],res_tuple): | |
| 115 | + mlp_res_by_level = [] | |
| 116 | + for res in levels : | |
| 117 | + mlp_res_list=[] | |
| 118 | + for layer in res : | |
| 119 | + mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], | |
| 120 | + layer[1],infer_model["LABEL"][mod]["DEV"],layer[2], | |
| 121 | + infer_model["LABEL"][mod]["TEST"], | |
| 122 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, | |
| 123 | + sgd=mlp_sgd,epochs=mlp_epochs,batch_size=mlp_batch_size, | |
| 124 | + fit_verbose=0)) | |
| 125 | + mlp_res_by_level.append(mlp_res_list) | |
| 126 | + db[name]["SPE"] = mlp_res_by_level | |
| 127 | + | |
| 128 | +db.close() |
LDA/04d-mmf_dsae.py
| 1 | + | |
| 2 | +# coding: utf-8 | |
| 3 | + | |
| 4 | +# In[2]: | |
| 5 | + | |
| 6 | +# Import | |
| 7 | +import gensim | |
| 8 | +from scipy import sparse | |
| 9 | +import itertools | |
| 10 | +from sklearn import preprocessing | |
| 11 | +from keras.models import Sequential | |
| 12 | +from keras.optimizers import SGD,Adam | |
| 13 | +from mlp import * | |
| 14 | +import mlp | |
| 15 | +import sklearn.metrics | |
| 16 | +import shelve | |
| 17 | +import pickle | |
| 18 | +from utils import * | |
| 19 | +import sys | |
| 20 | +import os | |
| 21 | +import json | |
| 22 | +# In[4]: | |
| 23 | + | |
| 24 | +infer_model=shelve.open("{}".format(sys.argv[2])) | |
| 25 | +in_dir = sys.argv[1] | |
| 26 | +#['ASR', 'TRS', 'LABEL'] | |
| 27 | +# In[6]: | |
| 28 | + | |
| 29 | +# AE params | |
| 30 | +hidden_size=[ 100, 100 ] | |
| 31 | +input_activation="relu" | |
| 32 | +output_activation="relu" | |
| 33 | +loss="mse" | |
| 34 | +epochs= 1000 | |
| 35 | +batch_size=1 | |
| 36 | +patience=20 | |
| 37 | +do_do=[ 0.25 ] * len(hidden_size) | |
| 38 | +sgd = Adam(lr=0.00001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) | |
| 39 | +try : | |
| 40 | + sgd_repr=sgd.get_config()["name"] | |
| 41 | +except AttributeError : | |
| 42 | + sgd_repr=sgd | |
| 43 | + | |
| 44 | +# Transforme : | |
| 45 | +trans_hidden_size=[ 300 , 300 ] | |
| 46 | +trans_input_activation="relu" | |
| 47 | +trans_output_activation="relu" | |
| 48 | +trans_loss="mse" | |
| 49 | +trans_epochs=1000 | |
| 50 | +trans_batch_size=8 | |
| 51 | +trans_patience=20 | |
| 52 | +trans_do=[ 0.25 ] * len(trans_hidden_size) | |
| 53 | +trans_sgd = Adam(lr=0.0001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) | |
| 54 | +try : | |
| 55 | + trans_sgd_repr=trans_sgd.get_config()["name"] | |
| 56 | +except AttributeError : | |
| 57 | + trans_sgd_repr=trans_sgd | |
| 58 | + | |
| 59 | + | |
| 60 | + | |
| 61 | +ae={ "h1" : "_".join([str(x) for x in hidden_size]), | |
| 62 | + "inside_activation" : input_activation, | |
| 63 | + "out_activation" : output_activation, | |
| 64 | + "do_dropout": "_".join([str(x) for x in do_do]), | |
| 65 | + "loss" : loss, | |
| 66 | + "epochs" : epochs , | |
| 67 | + "batch_size" : batch_size, | |
| 68 | + "patience" : patience, | |
| 69 | + "sgd" : sgd_repr} | |
| 70 | +name = "_".join([ str(x) for x in ae.values()]) | |
| 71 | + | |
| 72 | +trans={ "h1" : "_".join([str(x) for x in trans_hidden_size]), | |
| 73 | + "inside_activation" : trans_input_activation, | |
| 74 | + "out_activation" : trans_output_activation, | |
| 75 | + "do_dropout": "_".join([str(x) for x in trans_do]), | |
| 76 | + "loss" : trans_loss, | |
| 77 | + "epochs" : trans_epochs , | |
| 78 | + "batch_size" : trans_batch_size, | |
| 79 | + "patience" : trans_patience, | |
| 80 | + "sgd" : trans_sgd_repr} | |
| 81 | + | |
| 82 | +mlp_h = [ 300 , 300 ] | |
| 83 | +mlp_loss ="categorical_crossentropy" | |
| 84 | +mlp_dropouts = [0,0,0,0] | |
| 85 | +mlp_sgd = Adam(0.0001) | |
| 86 | +mlp_epochs = 1000 | |
| 87 | +mlp_batch_size = 8 | |
| 88 | +mlp_input_activation = "relu" | |
| 89 | +mlp_output_activation = "softmax" | |
| 90 | + | |
| 91 | +try : | |
| 92 | + mlp_sgd_repr=mlp_sgd.get_config()["name"] | |
| 93 | +except AttributeError : | |
| 94 | + mlp_sgd_repr=mlp_sgd | |
| 95 | + | |
| 96 | + | |
| 97 | + | |
| 98 | +mlp={ "h1" : "_".join([str(x) for x in mlp_h ]), | |
| 99 | + "inside_activation" : mlp_input_activation, | |
| 100 | + "out_activation" : mlp_output_activation, | |
| 101 | + "do_dropout": "_".join([str(x) for x in mlp_dropouts]), | |
| 102 | + "loss" : mlp_loss, | |
| 103 | + "epochs" : mlp_epochs , | |
| 104 | + "batch_size" : mlp_batch_size, | |
| 105 | + "sgd" : mlp_sgd_repr} | |
| 106 | + | |
| 107 | +params = { "ae":ae, "trans":trans, "mlp":mlp} | |
| 108 | +try: | |
| 109 | + os.mkdir("{}/DSAE_{}".format(in_dir,name)) | |
| 110 | +except: | |
| 111 | + pass | |
| 112 | +db = shelve.open("{}/DSAE_{}/ae_model.shelve".format(in_dir,name),writeback=True) | |
| 113 | +# | |
| 114 | +json.dump(params, | |
| 115 | + open("{}/DSAE_{}/ae_model.json".format(in_dir,name),"w"), | |
| 116 | + indent=4) | |
| 117 | + | |
| 118 | +keys = ["ASR","TRS"] | |
| 119 | + | |
| 120 | + | |
| 121 | + | |
| 122 | +db["DSAE"] = {} | |
| 123 | + | |
| 124 | +db["DSAEFT"] = {} | |
| 125 | +mod = "ASR" | |
| 126 | +res_tuple_ASR = train_ae(infer_model["LDA"][mod]["TRAIN"], | |
| 127 | + infer_model["LDA"][mod]["DEV"], | |
| 128 | + infer_model["LDA"][mod]["TEST"], | |
| 129 | + hidden_size,dropouts=do_do, | |
| 130 | + patience = patience,sgd=sgd, | |
| 131 | + input_activation=input_activation, | |
| 132 | + output_activation=output_activation,loss=loss,epochs=epochs, | |
| 133 | + batch_size=batch_size,verbose=0,get_weights=True) | |
| 134 | +mlp_res_list = [] | |
| 135 | +for layer in res_tuple_ASR[0]: | |
| 136 | + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], | |
| 137 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
| 138 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
| 139 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, | |
| 140 | + sgd=mlp_sgd,epochs=mlp_epochs, | |
| 141 | + output_activation=mlp_output_activation, | |
| 142 | + input_activation=mlp_input_activation, | |
| 143 | + batch_size=mlp_batch_size,fit_verbose=0)) | |
| 144 | + | |
| 145 | +db["DSAE"][mod] = mlp_res_list | |
| 146 | +mod = "TRS" | |
| 147 | +print hidden_size | |
| 148 | +res_tuple_TRS = train_ae(infer_model["LDA"][mod]["TRAIN"], | |
| 149 | + infer_model["LDA"][mod]["DEV"], | |
| 150 | + infer_model["LDA"][mod]["TEST"], | |
| 151 | + hidden_size,dropouts=do_do, | |
| 152 | + sgd=sgd,input_activation=input_activation, | |
| 153 | + output_activation=output_activation,loss=loss,epochs=epochs, | |
| 154 | + batch_size=batch_size,patience=patience, | |
| 155 | + verbose=0,get_weights=True) | |
| 156 | + | |
| 157 | +mlp_res_list = [] | |
| 158 | +for layer in res_tuple_TRS[0]: | |
| 159 | + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], | |
| 160 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
| 161 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
| 162 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, | |
| 163 | + sgd=mlp_sgd,epochs=mlp_epochs, | |
| 164 | + output_activation=mlp_output_activation, | |
| 165 | + input_activation=mlp_input_activation, | |
| 166 | + batch_size=mlp_batch_size,fit_verbose=0)) | |
| 167 | + | |
| 168 | +db["DSAE"][mod] = mlp_res_list | |
| 169 | + | |
| 170 | + | |
| 171 | + | |
| 172 | +transfert = [] | |
| 173 | + | |
| 174 | +print " get weight trans" | |
| 175 | + | |
| 176 | + | |
| 182 | +for asr_pred, trs_pred in zip(res_tuple_ASR[0], res_tuple_TRS[0]): | |
| 183 | + print "ASR", [ x.shape for x in asr_pred] | |
| 184 | + | |
| 185 | + print "TRS", [ x.shape for x in trs_pred] | |
| 186 | + transfert.append( train_ae(asr_pred[0], | |
| 187 | + asr_pred[1], | |
| 188 | + asr_pred[2], | |
| 189 | + trans_hidden_size, | |
| 190 | + dropouts=trans_do, | |
| 191 | + y_train = trs_pred[0], | |
| 192 | + y_dev=trs_pred[1], | |
| 193 | + y_test = trs_pred[2], | |
| 194 | + patience = trans_patience,sgd=trans_sgd, | |
| 195 | + input_activation=trans_input_activation, | |
| 196 | + output_activation=trans_output_activation, | |
| 197 | + loss=trans_loss, | |
| 198 | + epochs=trans_epochs, | |
| 199 | + batch_size=trans_batch_size,verbose=0,get_weights=True) ) | |
| 200 | +mod = "ASR" | |
| 201 | +mlp_res_bylvl = [] | |
| 202 | +print " MLP on transfert " | |
| 203 | +for level, w in transfert : | |
| 204 | + mlp_res_list = [] | |
| 205 | + for layer in level : | |
| 206 | + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], | |
| 207 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
| 208 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
| 209 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, | |
| 210 | + sgd=mlp_sgd,epochs=mlp_epochs, | |
| 211 | + output_activation=mlp_output_activation, | |
| 212 | + input_activation=mlp_input_activation, | |
| 213 | + batch_size=mlp_batch_size,fit_verbose=0)) | |
| 214 | + mlp_res_bylvl.append(mlp_res_list) | |
| 215 | +db["DSAE"]["transfert"] = mlp_res_bylvl | |
| 216 | + | |
| 217 | + | |
| 218 | +print " FT " | |
| 219 | +WA = res_tuple_ASR[1] | |
| 220 | +print "WA", len(WA), [ len(x) for x in WA] | |
| 221 | +WT = res_tuple_TRS[1] | |
| 222 | + | |
| 223 | +print "WT", len(WT), [ len(x) for x in WT] | |
| 224 | +Wtr = [ x[1] for x in transfert] | |
| 225 | + | |
| 226 | +print "Wtr", len(Wtr), [ len(x) for x in Wtr],[ len(x[1]) for x in Wtr] | |
| 227 | + | |
| 228 | +ft_res = ft_dsae(infer_model["LDA"]["ASR"]["TRAIN"], | |
| 229 | + infer_model["LDA"]["ASR"]["DEV"], | |
| 230 | + infer_model["LDA"]["ASR"]["TEST"], | |
| 231 | + y_train=infer_model["LDA"]["TRS"]["TRAIN"], | |
| 232 | + y_dev=infer_model["LDA"]["TRS"]["DEV"], | |
| 233 | + y_test=infer_model["LDA"]["TRS"]["TEST"], | |
| 234 | + ae_hidden = hidden_size, | |
| 235 | + transfer_hidden = trans_hidden_size, | |
| 236 | + start_weights = WA, | |
| 237 | + transfer_weights = Wtr, | |
| 238 | + end_weights = WT, | |
| 239 | + input_activation = input_activation, | |
| 240 | + output_activation = output_activation, | |
| 241 | + ae_dropouts= do_do, | |
| 242 | + transfer_do = trans_do, | |
| 243 | + sgd = sgd, | |
| 244 | + loss = loss , | |
| 245 | + patience = patience, | |
| 246 | + batch_size = batch_size, | |
| 247 | + epochs= epochs) | |
| 248 | +mlps_by_lvls= [] | |
| 249 | +for level in ft_res : | |
| 250 | + mlp_res_list = [] | |
| 251 | + for layer in level : | |
| 252 | + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], | |
| 253 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
| 254 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
| 255 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, | |
| 256 | + sgd=mlp_sgd,epochs=mlp_epochs, | |
| 257 | + output_activation=mlp_output_activation, | |
| 258 | + input_activation=mlp_input_activation, | |
| 259 | + batch_size=mlp_batch_size,fit_verbose=0)) | |
| 260 | + mlps_by_lvls.append(mlp_res_list) | |
| 261 | + | |
| 262 | + | |
| 263 | +db["DSAEFT"]["transfert"] = mlps_by_lvls | |
| 264 | + | |
| 265 | +db.close() |
LDA/04e-mm_vae.py
| 1 | + | |
| 2 | +# coding: utf-8 | |
| 3 | + | |
| 4 | +# In[2]: | |
| 5 | + | |
| 6 | +# Import | |
| 7 | +import gensim | |
| 8 | +from scipy import sparse | |
| 9 | +import itertools | |
| 10 | +from sklearn import preprocessing | |
| 11 | +from keras.models import Sequential | |
| 12 | +from keras.optimizers import SGD,Adam | |
| 13 | +from mlp import * | |
| 14 | +from vae import * | |
| 15 | +import sklearn.metrics | |
| 16 | +import shelve | |
| 17 | +import pickle | |
| 18 | +from utils import * | |
| 19 | +import sys | |
| 20 | +import os | |
| 21 | +import json | |
| 22 | +# In[4]: | |
| 23 | + | |
| 24 | +infer_model=shelve.open("{}".format(sys.argv[2])) | |
| 25 | +in_dir = sys.argv[1] | |
| 26 | +#['ASR', 'TRS', 'LABEL'] | |
| 27 | +# In[6]: | |
| 28 | + | |
| 29 | + | |
| 30 | +hidden_size= [60] | |
| 31 | +input_activation="tanh" | |
| 32 | +output_activation="sigmoid" | |
| 33 | +epochs=300 | |
| 34 | +batch=1 | |
| 35 | +patience=60 | |
| 36 | +sgd = Adam(lr=0.0001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) | |
| 37 | +latent_dim = 30 | |
| 38 | + | |
| 39 | + | |
| 40 | + | |
| 41 | +mlp_h = [ 256 ] | |
| 42 | +mlp_loss = "categorical_crossentropy" | |
| 43 | +mlp_dropouts = [] | |
| 44 | +mlp_sgd = Adam(lr=0.001) | |
| 45 | +mlp_epochs = 1000 | |
| 46 | +mlp_batch_size = 16 | |
| 47 | +mlp_output_activation="softmax" | |
| 48 | + | |
| 49 | +try : | |
| 50 | + sgd_repr=sgd.get_config()["name"] | |
| 51 | +except AttributeError : | |
| 52 | + sgd_repr=sgd | |
| 53 | + | |
| 54 | +try : | |
| 55 | + mlp_sgd_repr=mlp_sgd.get_config()["name"] | |
| 56 | +except AttributeError : | |
| 57 | + mlp_sgd_repr=mlp_sgd | |
| 58 | + | |
| 59 | + | |
| 60 | +params={ "h1" : "_".join([ str(x) for x in hidden_size ]), | |
| 61 | + "inside_activation" : input_activation, | |
| 62 | + "output_activation" : output_activation, | |
| 63 | + "epochs" : epochs , | |
| 64 | + "batch_size" : batch, | |
| 65 | + "patience" : patience, | |
| 66 | + "sgd" : sgd_repr, | |
| 67 | + "mlp_h ": "_".join([str(x) for x in mlp_h]), | |
| 68 | + "mlp_loss ": mlp_loss, | |
| 69 | + "mlp_dropouts ": "_".join([str(x) for x in mlp_dropouts]), | |
| 70 | + "mlp_sgd ": mlp_sgd_repr, | |
| 71 | + "mlp_epochs ": mlp_epochs, | |
| 72 | + "mlp_batch_size ": mlp_batch_size, | |
| 73 | + "mlp_output" : mlp_output_activation | |
| 74 | + } | |
| 75 | +name = "_".join([ str(x) for x in params.values()]) | |
| 76 | +try: | |
| 77 | + os.mkdir("{}/VAE_{}".format(in_dir,name)) | |
| 78 | +except: | |
| 79 | + pass | |
| 80 | +db = shelve.open("{}/VAE_{}/ae_model.shelve".format(in_dir,name),writeback=True) | |
| 81 | +db["params"] = params | |
| 82 | +db["LABEL"]=infer_model["LABEL"] | |
| 83 | +# | |
| 84 | +json.dump(params, | |
| 85 | + open("{}/VAE_{}/ae_model.json".format(in_dir,name),"w"), | |
| 86 | + indent=4) | |
| 87 | + | |
| 88 | +keys = ["ASR","TRS"] | |
| 89 | + | |
| 90 | +db["VAE"] = {} | |
| 91 | +db["LDA"] = {} | |
| 92 | +for mod in keys : | |
| 93 | + print mod | |
| 94 | + db["LDA"][mod] = train_mlp(infer_model["LDA"][mod]["TRAIN"],infer_model["LABEL"][mod]["TRAIN"], | |
| 95 | + infer_model["LDA"][mod]["DEV"],infer_model["LABEL"][mod]["DEV"], | |
| 96 | + infer_model["LDA"][mod]["TEST"],infer_model["LABEL"][mod]["TEST"], | |
| 97 | + mlp_h ,sgd=mlp_sgd, | |
| 98 | + epochs=mlp_epochs, | |
| 99 | + batch_size=mlp_batch_size, | |
| 100 | + input_activation=input_activation, | |
| 101 | + output_activation=mlp_output_activation, | |
| 102 | + dropouts=mlp_dropouts, | |
| 103 | + fit_verbose=0) | |
| 104 | + | |
| 105 | + res=train_vae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"],infer_model["LDA"][mod]["TEST"], | |
| 106 | + hidden_size=hidden_size[0], | |
| 107 | + latent_dim=latent_dim,sgd=sgd, | |
| 108 | + input_activation=input_activation,output_activation=output_activation, | |
| 109 | + nb_epochs=epochs,batch_size=batch) | |
| 110 | + mlp_res_list=[] | |
| 111 | + for layer in res : | |
| 112 | + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], | |
| 113 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
| 114 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
| 115 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs, | |
| 116 | + output_activation=mlp_output_activation, | |
| 117 | + input_activation=input_activation, | |
| 118 | + batch_size=mlp_batch_size,fit_verbose=0)) | |
| 119 | + db["VAE"][mod]=mlp_res_list | |
| 120 | + | |
| 121 | +mod = "ASR" | |
| 122 | +mod2= "TRS" | |
| 123 | +mlp_res_list=[] | |
| 124 | + | |
| 125 | +res = train_vae(infer_model["LDA"][mod]["TRAIN"], | |
| 126 | + infer_model["LDA"][mod]["DEV"], | |
| 127 | + infer_model["LDA"][mod]["TEST"], | |
| 128 | + hidden_size=hidden_size[0], | |
| 129 | + sgd=sgd,input_activation=input_activation,output_activation=output_activation, | |
| 130 | + latent_dim=latent_dim, | |
| 131 | + nb_epochs=epochs, | |
| 132 | + batch_size=batch, | |
| 133 | + y_train=infer_model["LDA"][mod2]["TRAIN"], | |
| 134 | + y_dev=infer_model["LDA"][mod2]["DEV"], | |
| 135 | + y_test=infer_model["LDA"][mod2]["TEST"]) | |
| 136 | + | |
| 137 | +for layer in res : | |
| 138 | + mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], | |
| 139 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
| 140 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
| 141 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs, | |
| 142 | + output_activation=mlp_output_activation, | |
| 143 | + input_activation=input_activation, | |
| 144 | + batch_size=mlp_batch_size,fit_verbose=0)) | |
| 145 | + | |
| 146 | +db["VAE"]["SPE"] = mlp_res_list | |
| 147 | + | |
| 148 | +db.sync() | |
| 149 | +db.close() |
LDA/05-mmf_getscore.py
| 1 | +import numpy as np | |
| 2 | +import shelve | |
| 3 | +import sys | |
| 4 | +import glob | |
| 5 | +from collections import defaultdict | |
| 6 | +from tinydb import TinyDB, Query | |
| 7 | +from mako.template import Template | |
| 8 | +import time | |
| 9 | + | |
| 10 | +def get_best(x): | |
| 11 | + argbest=np.argmax(x[1]) | |
| 12 | + maxdev=x[1][argbest] | |
| 13 | + maxtrain=np.max(x[0]) | |
| 14 | + maxtest=np.max(x[2]) | |
| 15 | + besttest=x[2][argbest] | |
| 16 | + return ( maxtrain,maxdev,maxtest,besttest) | |
| 17 | +depth = lambda L: isinstance(L, list) and max(map(depth, L))+1 | |
| 18 | + | |
| 19 | + | |
| 20 | +template_name = ''' | |
| 21 | +${name} | |
| 22 | +======================== | |
| 23 | + | |
| 24 | +MLP scores : | |
| 25 | +------------------- | |
| 26 | +''' | |
| 27 | +template_value='''\n\n | |
| 28 | +| ${model} ${ttype} | train | dev |max test| best test| | |
| 29 | +| -------------------:|:--------:|:---------:|:------:|:--------:| | |
| 30 | +% for cpt,line in enumerate(models[model][ttype]): | |
| 31 | +| ${cpt} | ${line[0]} | ${line[1]} |${line[2]} | ${line[3]} | | |
| 32 | +% endfor | |
| 33 | +\n | |
| 34 | +''' | |
| 35 | + | |
| 36 | +# ae_model.shelve | |
| 37 | +def get_folder_file(x): | |
| 38 | + folder=x.split("/")[1] | |
| 39 | + shelve_file = ".".join(x.split(".")[:-1]) | |
| 40 | + return(folder,shelve_file) | |
| 41 | + | |
| 42 | +in_folder = sys.argv[1] | |
| 43 | + | |
| 44 | + | |
| 45 | +models = defaultdict(dict) | |
| 46 | + | |
| 47 | +ae_model_list = glob.glob("{}/*/ae_model.shelve.dir".format(in_folder)) | |
| 48 | +ae_model_list = sorted(ae_model_list) | |
| 49 | +ae_model_list= map(get_folder_file,ae_model_list) | |
| 50 | +for name , shelve_file in ae_model_list : | |
| 51 | + print Template(template_name).render(name=name) | |
| 52 | + opened_shelve = shelve.open(shelve_file) | |
| 53 | + keys = opened_shelve.keys() | |
| 54 | + if "LABEL" in keys : | |
| 55 | + keys.remove("LABEL") | |
| 56 | + if "params" in keys: | |
| 57 | + keys.remove("params") | |
| 58 | + to_print = [] | |
| 59 | + for working_key in keys: | |
| 60 | + for key in opened_shelve[working_key].keys(): | |
| 61 | + table_depth = depth(opened_shelve[working_key][key]) | |
| 62 | + if table_depth == 3 : | |
| 63 | + models[working_key][key] = [ get_best(x) for x in opened_shelve[working_key][key] ] | |
| 64 | + to_print.append(Template(template_value).render(model=working_key,ttype=key,models=models).strip()) | |
| 65 | + elif table_depth == 2 : | |
| 66 | + models[working_key][key] = [ get_best(opened_shelve[working_key][key]) ] | |
| 67 | + to_print.append(Template(template_value).render(model=working_key,ttype=key,models=models).strip()) | |
| 68 | + elif table_depth == 4 : | |
| 69 | + for layer in opened_shelve[working_key][key] : | |
| 70 | + models[working_key][key] = [ get_best(x) for x in layer ] | |
| 71 | + to_print.append(Template(template_value).render(model=working_key,ttype=key,models=models).strip()) | |
| 72 | + print "\n".join(to_print) |
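As a worked example of get_best above (illustrative values; runnable if pasted at the end of the script, which already imports numpy as np): the dev argmax picks the epoch, and the test score at that epoch is reported next to the raw maxima.

```python
scores = ([0.5, 0.8, 0.9],   # train accuracy per epoch
          [0.4, 0.7, 0.6],   # dev accuracy per epoch
          [0.3, 0.65, 0.6])  # test accuracy per epoch
print(get_best(scores))      # (0.9, 0.7, 0.65, 0.65): argmax(dev) = epoch 1
```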
LDA/run2.sh
| 1 | 1 | #python 00-prepross.py |
| 2 | -python 02-lda-order.py DECODA_list_wid.shelve output_v5/perplex.db 50 10 output_v5 50_500 0 1_0.1 1_0.1 500_1000 100_2000 | |
| 2 | +python 02-lda.py DECODA_list_wid.shelve output_v5/perplex.db 50 10 output_v5 50_500 0 1_0.1 1_0.1 500_1000 100_2000 | |
| 3 | 3 | #python 03-perplex.py DECODA_list_wid.shelve output_v5 output_v5/perplex.db |
| 4 | 4 | python 03-order_by_perp.py output_v5/perplex.db output_v5 |
| 5 | 5 | bash 04-run_mlp_ae.sh output_v5 DECODA_list_wid.shelve |
LDA/utils.py
LDA/vae.py
| 1 | +'''This script demonstrates how to build a variational autoencoder with Keras. | |
| 2 | +Reference: "Auto-Encoding Variational Bayes" https://arxiv.org/abs/1312.6114 | |
| 3 | +''' | |
| 4 | + | |
| 5 | +import itertools | |
| 6 | +import sys | |
| 7 | +import json | |
| 8 | + | |
| 9 | +import numpy as np | |
| 10 | +import matplotlib.pyplot as plt | |
| 11 | +from scipy import sparse | |
| 12 | +import scipy.io | |
| 13 | + | |
| 14 | +from keras.layers import Input, Dense, Lambda | |
| 15 | +from keras.models import Model | |
| 16 | +from keras import backend as K | |
| 17 | +from keras import objectives | |
| 18 | +from keras.datasets import mnist | |
| 19 | + | |
| 20 | +import pandas | |
| 21 | +import shelve | |
| 22 | +import pickle | |
| 23 | + | |
| 24 | + | |
| 25 | + | |
| 26 | + | |
| 27 | + | |
| 28 | +#batch_size = 16 | |
| 29 | +#original_dim = 784 | |
| 30 | +#latent_dim = 2 | |
| 31 | +#intermediate_dim = 128 | |
| 32 | +#epsilon_std = 0.01 | |
| 33 | +#nb_epoch = 40 | |
| 34 | + | |
| 35 | + | |
| 36 | + | |
| 37 | + | |
| 38 | +def train_vae(x_train,x_dev,x_test,y_train=None,y_dev=None,y_test=None,hidden_size=80,latent_dim=12,batch_size=8,nb_epochs=10,sgd="rmsprop",input_activation = "relu",output_activation = "sigmoid",epsilon_std=0.01): | |
| 39 | + | |
| 40 | + | |
| 41 | + | |
| 42 | + def sampling(args): | |
| 43 | + z_mean, z_log_std = args | |
| 44 | + epsilon = K.random_normal(shape=(batch_size, latent_dim), | |
| 45 | + mean=0., std=epsilon_std) | |
| 46 | + return z_mean + K.exp(z_log_std) * epsilon | |
| 47 | + | |
| 48 | + def vae_loss(x, x_decoded_mean): | |
| 49 | + xent_loss = objectives.binary_crossentropy(x, x_decoded_mean) | |
| 50 | + kl_loss = - 0.5 * K.mean(1 + z_log_std - K.square(z_mean) - K.exp(z_log_std), axis=-1) | |
| 51 | + return xent_loss + kl_loss | |
| 52 | + | |
| 53 | + original_dim = x_train.shape[1] | |
| 54 | + | |
| 55 | + | |
| 56 | + x = Input(batch_shape=(batch_size, original_dim)) | |
| 57 | + h = Dense(hidden_size, activation=input_activation)(x) | |
| 58 | + z_mean = Dense(latent_dim)(h) | |
| 59 | + z_log_std = Dense(latent_dim)(h) | |
| 60 | + | |
| 61 | + | |
| 62 | + # note that "output_shape" isn't necessary with the TensorFlow backend | |
| 63 | + # so you could write `Lambda(sampling)([z_mean, z_log_std])` | |
| 64 | + z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_std]) | |
| 65 | + | |
| 66 | + # we instantiate these layers separately so as to reuse them later | |
| 67 | + decoder_h = Dense(hidden_size, activation=input_activation) | |
| 68 | + decoder_mean = Dense(original_dim, activation=output_activation) | |
| 69 | + h_decoded = decoder_h(z) | |
| 70 | + x_decoded_mean = decoder_mean(h_decoded) | |
| 71 | + | |
| 72 | + | |
| 73 | + vae = Model(x, x_decoded_mean) | |
| 74 | + vae.compile(optimizer=sgd, loss=vae_loss) | |
| 75 | + | |
| 76 | + # train the VAE on MNIST digits | |
| 77 | + if y_train is None or y_dev is None or y_test is None : | |
| 78 | + y_train = x_train | |
| 79 | + y_dev = x_dev | |
| 80 | + y_test = x_test | |
| 81 | + | |
| 82 | + vae.fit(x_train, y_train, | |
| 83 | + shuffle=True, | |
| 84 | + nb_epoch=nb_epochs, | |
| 85 | + batch_size=batch_size, | |
| 86 | + validation_data=(x_dev, y_dev)) | |
| 87 | + | |
| 88 | + # build a model to project inputs on the latent space | |
| 89 | + encoder = Model(x, z_mean) | |
| 90 | + pred_train = encoder.predict(x_train, batch_size=batch_size) | |
| 91 | + pred_dev = encoder.predict(x_dev, batch_size=batch_size) | |
| 92 | + pred_test = encoder.predict(x_test,batch_size=batch_size) | |
| 93 | + return [ [ pred_train, pred_dev, pred_test ] ] | |
| 94 | +# display a 2D plot of the digit classes in the latent space | |
| 95 | + #x_test_encoded = encoder.predict(x_test, batch_size=batch_size) | |
| 96 | + # build a digit generator that can sample from the learned distribution | |
| 97 | + #decoder_input = Input(shape=(latent_dim,)) | |
| 98 | + #_h_decoded = decoder_h(decoder_input) | |
| 99 | + #_x_decoded_mean = decoder_mean(_h_decoded) | |
| 100 | + #generator = Model(decoder_input, _x_decoded_mean) | |
| 101 | + #x_decoded = generator.predict(z_sample) | |
| 102 | + |
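A minimal usage sketch for train_vae, assuming a Keras 1.x environment (matching the nb_epoch/objectives API above). Random arrays stand in for the LDA features; set sizes must be multiples of batch_size because the encoder is built with a fixed batch_shape:

```python
import numpy as np
from vae import train_vae

x_train = np.random.uniform(0, 1, (96, 40)).astype("float32")
x_dev   = np.random.uniform(0, 1, (32, 40)).astype("float32")
x_test  = np.random.uniform(0, 1, (32, 40)).astype("float32")

layers = train_vae(x_train, x_dev, x_test,
                   hidden_size=60, latent_dim=30,
                   batch_size=8, nb_epochs=2)
pred_train, pred_dev, pred_test = layers[0]
print(pred_train.shape)  # (96, 30): latent means, used as features downstream
```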