Commit 7db73861ffbab3f3f51b17188d8894a512b36264

Authored by Killian
1 parent b6d0165d16
Exists in master

add vae and mmf

Showing 13 changed files with 1084 additions and 44 deletions

LDA/00-mmf_make_features.py
  1 +import sys
  2 +import os
  3 +
  4 +import pandas
  5 +import numpy
  6 +import shelve
  7 +
  8 +from sklearn.preprocessing import LabelBinarizer
  9 +
  10 +from utils import select_mmf as select
  11 +
  12 +input_dir = sys.argv[1] # top-level directory containing ASR and TRS
  13 +level = sys.argv[2] # desired LDA size ( -5)
  14 +
  15 +lb=LabelBinarizer()
  16 +#y_train=lb.fit_transform([utils.select(ligneid) for ligneid in origin_corps["LABEL"]["TRAIN"]])
  17 +
  18 +
  19 +data = shelve.open("{}/mmf_{}.shelve".format(input_dir,level),writeback=True) # writeback so nested dict updates persist on sync()
  20 +data["LABEL"], data["LDA"] = {}, {}
  21 +for mod in ["ASR", "TRS"] :
  22 + train = pandas.read_table("{}/{}/train_{}.ssv".format(input_dir, mod, level), sep=" ", header=None )
  23 + dev = pandas.read_table("{}/{}/dev_{}.ssv".format(input_dir, mod, level), sep=" ", header=None )
  24 + test = pandas.read_table("{}/{}/test_{}.ssv".format(input_dir, mod, level), sep=" ", header=None )
  25 +
  26 + y_train = train.iloc[:,0].apply(select)
  27 + y_dev = dev.iloc[:,0].apply(select)
  28 + y_test = test.iloc[:,0].apply(select)
  29 + lb.fit(y_train)
  30 + data["LABEL"][mod]={"TRAIN":lb.transform(y_train),"DEV":lb.transform(y_dev), "TEST": lb.transform(y_test)}
  31 +
  32 + data["LDA"][mod]={}
  33 + data["LDA"][mod]["TRAIN"]=train.iloc[:,1:].values
  34 + data["LDA"][mod]["DEV"]=dev.iloc[:,1:].values
  35 + data["LDA"][mod]["TEST"]=test.iloc[:,1:].values
  36 +
  37 +data.sync()
  38 +data.close()
... ... @@ -12,10 +12,11 @@
12 12 import dill
13 13 from tinydb import TinyDB, where, Query
14 14 import time
  15 +from joblib import Parallel, delayed
15 16  
16 17 def calc_perp(models,train):
17 18  
18   -
  19 +
19 20 stop_words=models[1]
20 21 name = models[0]
21 22  
... ... @@ -45,7 +46,8 @@
45 46 def train_lda(out_dir,train,size,it,sw_size,alpha,eta,passes,chunk):
46 47 name = "s{}_it{}_sw{}_a{}_e{}_p{}_c{}".format(size,it,sw_size,alpha,eta,passes,chunk)
47 48 logging.warning(name)
48   - if os.path.isfile(out_dir+"/"+name+".dill"):
  49 + deep_out_dir = out_dir+"/"+name
  50 + if os.path.isdir(deep_out_dir):
49 51 logging.error(name+" already done")
50 52 return
51 53 logging.warning(name+" to be done")
... ... @@ -54,7 +56,6 @@
54 56 asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ]
55 57 trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ]
56 58 stop_words=set(asr_sw) | set(trs_sw)
57   - stop_words=[ x.strip() for x in open("french.txt").readlines() ]
58 59  
59 60 logging.warning("TRS to be done")
60 61  
61 62  
62 63  
63 64  
... ... @@ -68,19 +69,42 @@
68 69 asr_probs = []
69 70 for line in lda_asr.expElogbeta:
70 71 nline = line / np.sum(line)
71   - asr_probs.append( str(x) for x in nline)
  72 + asr_probs.append([ str(x) for x in nline])
72 73 trs_probs = []
73 74 for line in lda_trs.expElogbeta:
74 75 nline = line / np.sum(line)
75   - trs_probs.append( str(x) for x in nline)
  76 + trs_probs.append([str(x) for x in nline])
76 77  
77 78 K = lda_asr.num_topics
78 79 topicWordProbMat_asr = lda_asr.print_topics(K,10)
79 80  
80 81 K = lda_trs.num_topics
81 82 topicWordProbMat_trs = lda_trs.print_topics(K,10)
  83 + os.mkdir(deep_out_dir)
  84 + dill.dump([x for x in stop_words],open(deep_out_dir+"/stopwords.dill","w"))
  85 + lda_asr.save(deep_out_dir+"/lda_asr.model")
  86 + lda_trs.save(deep_out_dir+"/lda_trs.model")
  87 + dill.dump([x for x in asr_probs],open(deep_out_dir+"/lda_asr_probs.dill","w"))
  88 + dill.dump([x for x in trs_probs],open(deep_out_dir+"/lda_trs_probs.dill","w"))
  89 +
82 90 return [name, stop_words, lda_asr , asr_probs , topicWordProbMat_asr, lda_trs, trs_probs, topicWordProbMat_trs]
83 91  
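     +# joblib worker: train one LDA configuration and return its perplexity record (None if already done)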
  92 +def train_one(name,train,s,i,sw,a,e,p,c):
  93 + st=time.time()
  94 + logging.warning(" ; ".join([str(x) for x in [s,i,sw,a,e,p,c]]))
  95 + models = train_lda(name,train,s,i,sw,a,e,p,c)
  96 + if models:
  97 + m = calc_perp(models,train)
  98 + #dill.dump(models,open("{}/{}.dill".format(name,models[0]),"wb"))
  99 + else :
  100 + m = None
  101 + end = time.time()
  102 + logging.warning("done in : {}".format(end-st))
  103 + return m
  104 +
  105 +
  106 +
  107 +
84 108 if __name__ == "__main__":
85 109 logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING)
86 110  
... ... @@ -109,6 +133,8 @@
109 133 db = TinyDB(db_path)
110 134 nb_model = len(passes) * len(chunk) * len(it) * len(sw_size) * len(alpha) * len(eta) * len(size)
111 135 logging.warning(" hey will train {} models ".format(nb_model))
  136 +
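     +   # build the full hyper-parameter grid, then train every configuration in parallel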
  137 + args_list=[]
112 138 for p in passes:
113 139 for c in chunk:
114 140 for i in it :
... ... @@ -116,13 +142,9 @@
116 142 for a in alpha:
117 143 for e in eta:
118 144 for s in size:
119   - st=time.time()
120   - logging.warning(" ; ".join([str(x) for x in [s,i,sw,a,e,p,c]]))
121   - models = train_lda(name,train,s,i,sw,a,e,p,c)
122   - if models:
123   - m = calc_perp(models,train)
124   - dill.dump(models,open("{}/{}.dill".format(name,models[0]),"wb"))
125   - db.insert(m)
126   - e = time.time()
127   - logging.warning("fin en : {}".format(e-st))
  145 + args_list.append((name,train,s,i,sw,a,e,p,c))
  146 + res_list= Parallel(n_jobs=15)(delayed(train_one)(*args) for args in args_list)
  147 + for m in res_list :
  148 +     if m : db.insert(m) # train_one returns None for configs that were skipped
  149 +
LDA/03-mono_perplex.py
... ... @@ -52,7 +52,7 @@
52 52 input_dir = sys.argv[2]
53 53 db_path = sys.argv[3]
54 54 logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING)
55   - folders = glob.glob("{}/*".format(input_dir))
  55 + folders = glob.glob("{}/s*".format(input_dir))
56 56  
57 57 #train=pickle.load(open("{}/newsgroup_bow_train.pk".format(input_dir)))
58 58 train = shelve.open(input_shelve)
... ... @@ -22,40 +22,43 @@
22 22  
23 23  
24 24 def calc_perp(params):
25   - in_dir,train = params
26   - name = in_dir.split("/")[-1]
27   - # s40_it1_sw50_a0.01_e0.1_p6_c1000
  25 + try:
  26 + in_dir,train = params
  27 + name = in_dir.split("/")[-1]
  28 + # s40_it1_sw50_a0.01_e0.1_p6_c1000
28 29  
29   - entry = Query()
30   - value=db.search(entry.name == name)
31   - if len(value) > 0 :
32   - logging.warning("{} already done".format(name))
33   - return
  30 + entry = Query()
  31 + value=db.search(entry.name == name)
  32 + if len(value) > 0 :
  33 + logging.warning("{} already done".format(name))
  34 + return
34 35  
35   - sw_size = int(name.split("_")[2][2:])
  36 + sw_size = int(name.split("_")[2][2:])
36 37  
37   - logging.warning(" go {} ".format(name))
  38 + logging.warning(" go {} ".format(name))
38 39  
39 40  
40   - logging.warning("Redo Vocab and stop")
41   - asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y])
42   - trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y])
43   - asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ]
44   - trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ]
45   - stop_words=set(asr_sw) | set(trs_sw)
  41 + logging.warning("Redo Vocab and stop")
  42 + asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y])
  43 + trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y])
  44 + asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ]
  45 + trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ]
  46 + stop_words=set(asr_sw) | set(trs_sw)
46 47  
47   - logging.warning("TRS to be done")
48   -
49   - dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]]
50   - lda_trs = LdaModel.load("{}/lda_trs.model".format(in_dir))
51   - perp_trs = lda_trs.log_perplexity(dev_trs)
52   - logging.warning("ASR to be done")
53   - dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]]
54   - lda_asr = LdaModel.load("{}/lda_asr.model".format(in_dir))
55   - perp_asr = lda_asr.log_perplexity(dev_asr)
56   - logging.warning("ASR saving")
57   - res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs}
58   - return res_dict
  48 + logging.warning("TRS to be done")
  49 +
  50 + dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]]
  51 + lda_trs = LdaModel.load("{}/lda_trs.model".format(in_dir))
  52 + perp_trs = lda_trs.log_perplexity(dev_trs)
  53 + logging.warning("ASR to be done")
  54 + dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]]
  55 + lda_asr = LdaModel.load("{}/lda_asr.model".format(in_dir))
  56 + perp_asr = lda_asr.log_perplexity(dev_asr)
  57 + logging.warning("ASR saving")
  58 + res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs}
  59 + return res_dict
  60 + except :
  61 + return { "name" : name }
59 62  
60 63 if __name__ == "__main__":
61 64 input_shelve = sys.argv[1]
  1 +
  2 +# coding: utf-8
  3 +
  4 +# In[29]:
  5 +
  6 +# Import
  7 +import itertools
  8 +import shelve
  9 +import pickle
  10 +import numpy
  11 +import scipy
  12 +from scipy import sparse
  13 +import scipy.sparse
  14 +import scipy.io
  15 +from mlp import *
  16 +import mlp
  17 +import sys
  18 +import utils
  19 +import dill
  20 +from collections import Counter
  21 +from gensim.models import LdaModel
  22 +
  23 +
  24 +
  25 +# In[3]:
  26 +
  27 +#30_50_50_150_0.0001
  28 +
  29 +# In[4]:
  30 +
  31 +#db=shelve.open("SPELIKE_MLP_DB.shelve",writeback=True)
  32 +origin_corps=shelve.open("{}".format(sys.argv[2]))
  33 +in_dir = sys.argv[1]
  34 +
  35 +
  36 +out_db=shelve.open("{}/mlp_scores.shelve".format(in_dir),writeback=True)
  37 +
  38 +mlp_h = [ 250, 250 ]
  39 +mlp_loss = "categorical_crossentropy"
  40 +mlp_dropouts = [0.25]* len(mlp_h)
  41 +mlp_sgd = Adam(lr=0.0001)
  42 +mlp_epochs = 3000
  43 +mlp_batch_size = 1
  44 +mlp_input_activation = "relu"
  45 +mlp_output_activation="softmax"
  46 +
  47 +ress = []
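     +# train one MLP per modality on the LDA features and keep the best dev / corresponding test scores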
  48 +for key in ["TRS", "ASR"] :
  49 +
  50 + res=mlp.train_mlp(origin_corps["LDA"][key]["TRAIN"],origin_corps["LABEL"][key]["TRAIN"],
  51 + origin_corps["LDA"][key]["DEV"],origin_corps["LABEL"][key]["DEV"],
  52 + origin_corps["LDA"][key]["TEST"],origin_corps["LABEL"][key]["TEST"],
  53 + mlp_h,dropouts=mlp_dropouts,sgd=mlp_sgd,
  54 + epochs=mlp_epochs,
  55 + batch_size=mlp_batch_size,
  56 + save_pred=False,keep_histo=False,
  57 + loss="categorical_crossentropy",fit_verbose=0)
  58 + arg_best=[]
  59 + dev_best=[]
  60 + # keep the 12 best dev scores: repeatedly take the argmax then zero it out
  61 + for _ in range(12):
  62 +     arg_best.append(numpy.argmax(res[1]))
  63 +     dev_best.append(res[1][arg_best[-1]])
  64 +     res[1][arg_best[-1]]=0
  96 +
  97 +
  98 +
  99 +
  100 + test_best =[ res[2][x] for x in arg_best ]
  101 + test_max = numpy.max(res[2])
  102 + out_db[key]=(res,(dev_best,test_best,test_max))
  103 + ress.append((key,dev_best,test_best,test_max))
  104 +
  105 +for el in ress :
  106 + print el
  107 +out_db.close()
  108 +origin_corps.close()
LDA/04b-mmf_mini_ae.py
  1 +
  2 +# coding: utf-8
  3 +
  4 +# In[2]:
  5 +
  6 +# Import
  7 +import gensim
  8 +from scipy import sparse
  9 +import itertools
  10 +from sklearn import preprocessing
  11 +from keras.models import Sequential
  12 +from keras.optimizers import SGD,Adam
  13 +from mlp import *
  14 +import sklearn.metrics
  15 +import shelve
  16 +import pickle
  17 +from utils import *
  18 +import sys
  19 +import os
  20 +import json
  21 +# In[4]:
  22 +
  23 +infer_model=shelve.open("{}".format(sys.argv[2]))
  24 +in_dir = sys.argv[1]
  25 +#['ASR', 'TRS', 'LABEL']
  26 +# In[6]:
  27 +
  28 +
  29 +hidden_size=[ 100 , 50, 100 ]
  30 +input_activation="tanh"
  31 +output_activation="tanh"
  32 +loss="mse"
  33 +epochs=1000
  34 +batch=1
  35 +patience=60
  36 +do_do=[False]
  37 +sgd = Adam(lr=0.000001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True)
  38 +
  39 +
  40 +
  41 +mlp_h = [ 150 ,150 ,150 ]
  42 +mlp_loss = "categorical_crossentropy"
  43 +mlp_dropouts = []
  44 +mlp_sgd = Adam(lr=0.0001)
  45 +mlp_epochs = 2000
  46 +mlp_batch_size = 8
  47 +mlp_output_activation="softmax"
  48 +
  49 +try :
  50 + sgd_repr=sgd.get_config()["name"]
  51 +except AttributeError :
  52 + sgd_repr=sgd
  53 +
  54 +try :
  55 + mlp_sgd_repr=mlp_sgd.get_config()["name"]
  56 +except AttributeError :
  57 + mlp_sgd_repr=mlp_sgd
  58 +
  59 +
  60 +params={ "h1" : "_".join([ str(x) for x in hidden_size ]),
  61 + "inside_activation" : input_activation,
  62 + "output_activation" : output_activation,
  63 + "do_dropout": "_".join([str(x) for x in do_do]),
  64 + "loss" : loss,
  65 + "epochs" : epochs ,
  66 + "batch_size" : batch,
  67 + "patience" : patience,
  68 + "sgd" : sgd_repr,
  69 + "mlp_h ": "_".join([str(x) for x in mlp_h]),
  70 + "mlp_loss ": mlp_loss,
  71 + "mlp_dropouts ": "_".join([str(x) for x in mlp_dropouts]),
  72 + "mlp_sgd ": mlp_sgd_repr,
  73 + "mlp_epochs ": mlp_epochs,
  74 + "mlp_batch_size ": mlp_batch_size,
  75 + "mlp_output" : mlp_output_activation
  76 + }
  77 +name = "_".join([ str(x) for x in params.values()])
  78 +try:
  79 + os.mkdir("{}/{}".format(in_dir,name))
  80 +except:
  81 + pass
  82 +db = shelve.open("{}/{}/ae_model.shelve".format(in_dir,name),writeback=True)
  83 +db["params"] = params
  84 +db["LABEL"]=infer_model["LABEL"]
  85 +#
  86 +json.dump(params,
  87 + open("{}/{}/ae_model.json".format(in_dir,name),"w"),
  88 + indent=4)
  89 +
  90 +keys = ["ASR","TRS"]
  91 +
  92 +db["AE"] = {}
  93 +db["LDA"] = {}
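     +# per modality: baseline MLP on the raw LDA features, then an autoencoder whose layer outputs each feed an MLP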
  94 +for mod in keys :
  95 + print mod
  96 + db["LDA"][mod] = train_mlp(infer_model["LDA"][mod]["TRAIN"],infer_model["LABEL"][mod]["TRAIN"],
  97 + infer_model["LDA"][mod]["DEV"],infer_model["LABEL"][mod]["DEV"],
  98 + infer_model["LDA"][mod]["TEST"],infer_model["LABEL"][mod]["TEST"],
  99 + mlp_h ,sgd=mlp_sgd,
  100 + epochs=mlp_epochs,
  101 + batch_size=mlp_batch_size,
  102 + input_activation=input_activation,
  103 + output_activation=mlp_output_activation,
  104 + dropouts=mlp_dropouts,
  105 + fit_verbose=0)
  106 +
  107 + res=train_ae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"],infer_model["LDA"][mod]["TEST"],
  108 + hidden_size,patience = params["patience"],sgd=sgd,
  109 + dropouts=do_do,input_activation=input_activation,output_activation=output_activation,
  110 + loss=loss,epochs=epochs,batch_size=batch,verbose=0)
  111 + mlp_res_list=[]
  112 + for layer in res :
  113 + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"],
  114 + layer[1],infer_model["LABEL"][mod]["DEV"],
  115 + layer[2],infer_model["LABEL"][mod]["TEST"],
  116 + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs,
  117 + output_activation=mlp_output_activation,
  118 + input_activation=input_activation,
  119 + batch_size=mlp_batch_size,fit_verbose=0))
  120 + db["AE"][mod]=mlp_res_list
  121 +
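     +# SPE: autoencoder fed with ASR features and trained to reconstruct the TRS features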
  122 +mod = "ASR"
  123 +mod2= "TRS"
  124 +mlp_res_list=[]
  125 +
  126 +res = train_ae(infer_model["LDA"][mod]["TRAIN"],
  127 + infer_model["LDA"][mod]["DEV"],
  128 + infer_model["LDA"][mod]["TEST"],
  129 + hidden_size,dropouts=do_do,patience = params["patience"],
  130 + sgd=sgd,input_activation=input_activation,output_activation=output_activation,loss=loss,epochs=epochs,
  131 + batch_size=batch,
  132 + y_train=infer_model["LDA"][mod2]["TRAIN"],
  133 + y_dev=infer_model["LDA"][mod2]["DEV"],
  134 + y_test=infer_model["LDA"][mod2]["TEST"])
  135 +
  136 +for layer in res :
  137 + mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"],
  138 + layer[1],infer_model["LABEL"][mod]["DEV"],
  139 + layer[2],infer_model["LABEL"][mod]["TEST"],
  140 + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs,
  141 + output_activation=mlp_output_activation,
  142 + input_activation=input_activation,
  143 + batch_size=mlp_batch_size,fit_verbose=0))
  144 +
  145 +db["AE"]["SPE"] = mlp_res_list
  146 +
  147 +db.sync()
  148 +db.close()
  1 +
  2 +# coding: utf-8
  3 +
  4 +# In[2]:
  5 +
  6 +# Import
  7 +import gensim
  8 +from scipy import sparse
  9 +import itertools
  10 +from sklearn import preprocessing
  11 +from keras.models import Sequential
  12 +from keras.optimizers import SGD,Adam
  13 +from mlp import *
  14 +import mlp
  15 +import sklearn.metrics
  16 +import shelve
  17 +import pickle
  18 +from utils import *
  19 +import sys
  20 +import os
  21 +import json
  22 +# In[4]:
  23 +
  24 +infer_model=shelve.open("{}".format(sys.argv[2]))
  25 +in_dir = sys.argv[1]
  26 +#['ASR', 'TRS', 'LABEL']
  27 +# In[6]:
  28 +
  29 +
  30 +hidden_size=[ 100, 80, 50 , 20 ]
  31 +input_activation="relu"
  32 +output_activation="relu"
  33 +loss="mse"
  34 +epochs=3000
  35 +batch=1
  36 +patience=20
  37 +do_do=[ 0 ] * len(hidden_size)
  38 +sgd = Adam(lr=0.0001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True)
  39 +try :
  40 + sgd_repr=sgd.get_config()["name"]
  41 +except AttributeError :
  42 + sgd_repr=sgd
  43 +
  44 +params={ "h1" : "_".join([str(x) for x in hidden_size]),
  45 + "inside_activation" : input_activation,
  46 + "out_activation" : output_activation,
  47 + "do_dropout": "_".join([str(x) for x in do_do]),
  48 + "loss" : loss,
  49 + "epochs" : epochs ,
  50 + "batch_size" : batch,
  51 + "patience" : patience,
  52 + "sgd" : sgd_repr}
  53 +name = "_".join([ str(x) for x in params.values()])
  54 +try:
  55 + os.mkdir("{}/SAE_{}".format(in_dir,name))
  56 +except:
  57 + pass
  58 +db = shelve.open("{}/SAE_{}/ae_model.shelve".format(in_dir,name),writeback=True)
  59 +#
  60 +json.dump(params,
  61 + open("{}/SAE_{}/ae_model.json".format(in_dir,name),"w"),
  62 + indent=4)
  63 +
  64 +keys = ["ASR","TRS"]
  65 +
  66 +mlp_h = [ 150 , 300 ]
  67 +mlp_loss ="categorical_crossentropy"
  68 +mlp_dropouts = [0,0,0,0]
  69 +mlp_sgd = Adam(0.001)
  70 +mlp_epochs = 2000
  71 +mlp_batch_size = 8
  72 +
  73 +db["SAE"] = {}
  74 +
  75 +db["SAEFT"] = {}
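     +# per modality: stacked autoencoder; res_tuple holds the plain (SAE) and fine-tuned (SAEFT) layer outputs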
  76 +for mod in keys :
  77 + print "MODE ", mod
  78 + res_tuple=train_sae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"],
  79 + infer_model["LDA"][mod]["TEST"],
  80 + hidden_size,dropouts=do_do,
  81 + patience = params["patience"],sgd=sgd,input_activation="tanh",
  82 + output_activation="tanh",loss=loss,epochs=epochs,
  83 + batch_size=batch,verbose=0)
  84 + #print len(res), [len(x) for x in res[0]], [ len(x) for x in res[1]]
  85 + for name , levels in zip(["SAE","SAEFT"],res_tuple):
  86 + print "NAME", name
  87 + mlp_res_by_level = []
  88 + for res in levels:
  89 + mlp_res_list=[]
  90 + for nb,layer in enumerate(res) :
  91 + print "layer NB",nb
  92 + mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"],
  93 + layer[1],infer_model["LABEL"][mod]["DEV"],
  94 + layer[2],infer_model["LABEL"][mod]["TEST"],
  95 + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,
  96 + sgd=mlp_sgd,epochs=mlp_epochs,batch_size=mlp_batch_size,
  97 + fit_verbose=0))
  98 + mlp_res_by_level.append(mlp_res_list)
  99 + db[name][mod]=mlp_res_by_level
  100 +
  101 +mod = "ASR"
  102 +mod2= "TRS"
  103 +print "mode SPE "
  104 +res_tuple = train_sae(infer_model["LDA"][mod]["TRAIN"],
  105 + infer_model["LDA"][mod]["DEV"],
  106 + infer_model["LDA"][mod]["TEST"],
  107 + hidden_size,dropouts=[0],patience=params["patience"],
  108 + sgd=sgd,input_activation=input_activation,output_activation=input_activation,
  109 + loss=loss,epochs=epochs,batch_size=batch,
  110 + y_train=infer_model["LDA"][mod2]["TRAIN"],
  111 + y_dev=infer_model["LDA"][mod2]["DEV"],
  112 + y_test=infer_model["LDA"][mod2]["TEST"])
  113 +
  114 +for name , levels in zip(["SAE","SAEFT"],res_tuple):
  115 + mlp_res_by_level = []
  116 + for res in levels :
  117 + mlp_res_list=[]
  118 + for layer in res :
  119 + mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"],
  120 + layer[1],infer_model["LABEL"][mod]["DEV"],layer[2],
  121 + infer_model["LABEL"][mod]["TEST"],
  122 + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,
  123 + sgd=mlp_sgd,epochs=mlp_epochs,batch_size=mlp_batch_size,
  124 + fit_verbose=0))
  125 + mlp_res_by_level.append(mlp_res_list)
  126 + db[name]["SPE"] = mlp_res_by_level
  127 +
  128 +db.close()
  1 +
  2 +# coding: utf-8
  3 +
  4 +# In[2]:
  5 +
  6 +# Import
  7 +import gensim
  8 +from scipy import sparse
  9 +import itertools
  10 +from sklearn import preprocessing
  11 +from keras.models import Sequential
  12 +from keras.optimizers import SGD,Adam
  13 +from mlp import *
  14 +import mlp
  15 +import sklearn.metrics
  16 +import shelve
  17 +import pickle
  18 +from utils import *
  19 +import sys
  20 +import os
  21 +import json
  22 +# In[4]:
  23 +
  24 +infer_model=shelve.open("{}".format(sys.argv[2]))
  25 +in_dir = sys.argv[1]
  26 +#['ASR', 'TRS', 'LABEL']
  27 +# In[6]:
  28 +
  29 +# AE params
  30 +hidden_size=[ 100, 100 ]
  31 +input_activation="relu"
  32 +output_activation="relu"
  33 +loss="mse"
  34 +epochs= 1000
  35 +batch_size=1
  36 +patience=20
  37 +do_do=[ 0.25 ] * len(hidden_size)
  38 +sgd = Adam(lr=0.00001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True)
  39 +try :
  40 + sgd_repr=sgd.get_config()["name"]
  41 +except AttributeError :
  42 + sgd_repr=sgd
  43 +
  44 +# Transforme :
  45 +trans_hidden_size=[ 300 , 300 ]
  46 +trans_input_activation="relu"
  47 +trans_output_activation="relu"
  48 +trans_loss="mse"
  49 +trans_epochs=1000
  50 +trans_batch_size=8
  51 +trans_patience=20
  52 +trans_do=[ 0.25 ] * len(trans_hidden_size)
  53 +trans_sgd = Adam(lr=0.0001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True)
  54 +try :
  55 + trans_sgd_repr=trans_sgd.get_config()["name"]
  56 +except AttributeError :
  57 + trans_sgd_repr=trans_sgd
  58 +
  59 +
  60 +
  61 +ae={ "h1" : "_".join([str(x) for x in hidden_size]),
  62 + "inside_activation" : input_activation,
  63 + "out_activation" : output_activation,
  64 + "do_dropout": "_".join([str(x) for x in do_do]),
  65 + "loss" : loss,
  66 + "epochs" : epochs ,
  67 + "batch_size" : batch_size,
  68 + "patience" : patience,
  69 + "sgd" : sgd_repr}
  70 +name = "_".join([ str(x) for x in ae.values()])
  71 +
  72 +trans={ "h1" : "_".join([str(x) for x in trans_hidden_size]),
  73 + "inside_activation" : trans_input_activation,
  74 + "out_activation" : trans_output_activation,
  75 + "do_dropout": "_".join([str(x) for x in trans_do]),
  76 + "loss" : trans_loss,
  77 + "epochs" : trans_epochs ,
  78 + "batch_size" : trans_batch_size,
  79 + "patience" : trans_patience,
  80 + "sgd" : trans_sgd_repr}
  81 +
  82 +mlp_h = [ 300 , 300 ]
  83 +mlp_loss ="categorical_crossentropy"
  84 +mlp_dropouts = [0,0,0,0]
  85 +mlp_sgd = Adam(0.0001)
  86 +mlp_epochs = 1000
  87 +mlp_batch_size = 8
  88 +mlp_input_activation = "relu"
  89 +mlp_output_activation = "softmax"
  90 +
  91 +try :
  92 + mlp_sgd_repr=mlp_sgd.get_config()["name"]
  93 +except AttributeError :
  94 + mlp_sgd_repr=mlp_sgd
  95 +
  96 +
  97 +
  98 +mlp={ "h1" : "_".join([str(x) for x in mlp_h ]),
  99 + "inside_activation" : mlp_input_activation,
  100 + "out_activation" : mlp_output_activation,
  101 + "do_dropout": "_".join([str(x) for x in mlp_dropouts]),
  102 + "loss" : mlp_loss,
  103 + "epochs" : mlp_epochs ,
  104 + "batch_size" : mlp_batch_size,
  105 + "sgd" : mlp_sgd_repr}
  106 +
  107 +params = { "ae":ae, "trans":trans, "mlp":mlp}
  108 +try:
  109 + os.mkdir("{}/DSAE_{}".format(in_dir,name))
  110 +except:
  111 + pass
  112 +db = shelve.open("{}/DSAE_{}/ae_model.shelve".format(in_dir,name),writeback=True)
  113 +#
  114 +json.dump(params,
  115 + open("{}/DSAE_{}/ae_model.json".format(in_dir,name),"w"),
  116 + indent=4)
  117 +
  118 +keys = ["ASR","TRS"]
  119 +
  120 +
  121 +
  122 +db["DSAE"] = {}
  123 +
  124 +db["DSAEFT"] = {}
  125 +mod = "ASR"
  126 +res_tuple_ASR = train_ae(infer_model["LDA"][mod]["TRAIN"],
  127 + infer_model["LDA"][mod]["DEV"],
  128 + infer_model["LDA"][mod]["TEST"],
  129 + hidden_size,dropouts=do_do,
  130 + patience = patience,sgd=sgd,
  131 + input_activation=input_activation,
  132 + output_activation=output_activation,loss=loss,epochs=epochs,
  133 + batch_size=batch_size,verbose=0,get_weights=True)
  134 +mlp_res_list = []
  135 +for layer in res_tuple_ASR[0]:
  136 + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"],
  137 + layer[1],infer_model["LABEL"][mod]["DEV"],
  138 + layer[2],infer_model["LABEL"][mod]["TEST"],
  139 + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,
  140 + sgd=mlp_sgd,epochs=mlp_epochs,
  141 + output_activation=mlp_output_activation,
  142 + input_activation=mlp_input_activation,
  143 + batch_size=mlp_batch_size,fit_verbose=0))
  144 +
  145 +db["DSAE"][mod] = mlp_res_list
  146 +mod = "TRS"
  147 +print hidden_size
  148 +res_tuple_TRS = train_ae(infer_model["LDA"][mod]["TRAIN"],
  149 + infer_model["LDA"][mod]["DEV"],
  150 + infer_model["LDA"][mod]["TEST"],
  151 + hidden_size,dropouts=do_do,
  152 + sgd=sgd,input_activation=input_activation,
  153 + output_activation=output_activation,loss=loss,epochs=epochs,
  154 + batch_size=batch_size,patience=patience,
  155 + verbose=0,get_weights=True)
  156 +
  157 +mlp_res_list = []
  158 +for layer in res_tuple_TRS[0]:
  159 + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"],
  160 + layer[1],infer_model["LABEL"][mod]["DEV"],
  161 + layer[2],infer_model["LABEL"][mod]["TEST"],
  162 + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,
  163 + sgd=mlp_sgd,epochs=mlp_epochs,
  164 + output_activation=mlp_output_activation,
  165 + input_activation=mlp_input_activation,
  166 + batch_size=mlp_batch_size,fit_verbose=0))
  167 +
  168 +db["DSAE"][mod] = mlp_res_list
  169 +
  170 +
  171 +
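     +# transfer autoencoders: for each layer, learn a mapping from the ASR representation to the matching TRS one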
  172 +transfert = []
  173 +
  174 +print " get weight trans"
  175 +
  176 +for asr_pred, trs_pred in zip(res_tuple_ASR[0], res_tuple_TRS[0]):
  177 + print "ASR", [ x.shape for x in asr_pred]
  178 +
  179 + print "TRS", [ x.shape for x in trs_pred]
  180 + print
  181 +
  182 +for asr_pred, trs_pred in zip(res_tuple_ASR[0], res_tuple_TRS[0]):
  183 + print "ASR", [ x.shape for x in asr_pred]
  184 +
  185 + print "TRS", [ x.shape for x in trs_pred]
  186 + transfert.append( train_ae(asr_pred[0],
  187 + asr_pred[1],
  188 + asr_pred[2],
  189 + trans_hidden_size,
  190 + dropouts=trans_do,
  191 + y_train = trs_pred[0],
  192 + y_dev=trs_pred[1],
  193 + y_test = trs_pred[2],
  194 + patience = trans_patience,sgd=trans_sgd,
  195 + input_activation=trans_input_activation,
  196 + output_activation=trans_output_activation,
  197 + loss=trans_loss,
  198 + epochs=trans_epochs,
  199 + batch_size=trans_batch_size,verbose=0,get_weights=True) )
  200 +mod = "ASR"
  201 +mlp_res_bylvl = []
  202 +print " MLP on transfert "
  203 +for level, w in transfert :
  204 + mlp_res_list = []
  205 + for layer in level :
  206 + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"],
  207 + layer[1],infer_model["LABEL"][mod]["DEV"],
  208 + layer[2],infer_model["LABEL"][mod]["TEST"],
  209 + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,
  210 + sgd=mlp_sgd,epochs=mlp_epochs,
  211 + output_activation=mlp_output_activation,
  212 + input_activation=mlp_input_activation,
  213 + batch_size=mlp_batch_size,fit_verbose=0))
  214 + mlp_res_bylvl.append(mlp_res_list)
  215 +db["DSAE"]["transfert"] = mlp_res_bylvl
  216 +
  217 +
  218 +print " FT "
  219 +WA = res_tuple_ASR[1]
  220 +print "WA", len(WA), [ len(x) for x in WA]
  221 +WT = res_tuple_TRS[1]
  222 +
  223 +print "WT", len(WT), [ len(x) for x in WT]
  224 +Wtr = [ x[1] for x in transfert]
  225 +
  226 +print "Wtr", len(Wtr), [ len(x) for x in Wtr],[ len(x[1]) for x in Wtr]
  227 +
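     +# fine-tune the full chain with ft_dsae: ASR auto-encoder weights, transfer weights, then TRS weights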
  228 +ft_res = ft_dsae(infer_model["LDA"]["ASR"]["TRAIN"],
  229 + infer_model["LDA"]["ASR"]["DEV"],
  230 + infer_model["LDA"]["ASR"]["TEST"],
  231 + y_train=infer_model["LDA"]["TRS"]["TRAIN"],
  232 + y_dev=infer_model["LDA"]["TRS"]["DEV"],
  233 + y_test=infer_model["LDA"]["TRS"]["TEST"],
  234 + ae_hidden = hidden_size,
  235 + transfer_hidden = trans_hidden_size,
  236 + start_weights = WA,
  237 + transfer_weights = Wtr,
  238 + end_weights = WT,
  239 + input_activation = input_activation,
  240 + output_activation = output_activation,
  241 + ae_dropouts= do_do,
  242 + transfer_do = trans_do,
  243 + sgd = sgd,
  244 + loss = loss ,
  245 + patience = patience,
  246 + batch_size = batch_size,
  247 + epochs= epochs)
  248 +mlps_by_lvls= []
  249 +for level in ft_res :
  250 + mlp_res_list = []
  251 + for layer in level :
  252 + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"],
  253 + layer[1],infer_model["LABEL"][mod]["DEV"],
  254 + layer[2],infer_model["LABEL"][mod]["TEST"],
  255 + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,
  256 + sgd=mlp_sgd,epochs=mlp_epochs,
  257 + output_activation=mlp_output_activation,
  258 + input_activation=mlp_input_activation,
  259 + batch_size=mlp_batch_size,fit_verbose=0))
  260 + mlps_by_lvls.append(mlp_res_list)
  261 +
  262 +
  263 +db["DSAEFT"]["transfert"] = mlps_by_lvls
  264 +
  265 +db.close()
  1 +
  2 +# coding: utf-8
  3 +
  4 +# In[2]:
  5 +
  6 +# Import
  7 +import gensim
  8 +from scipy import sparse
  9 +import itertools
  10 +from sklearn import preprocessing
  11 +from keras.models import Sequential
  12 +from keras.optimizers import SGD,Adam
  13 +from mlp import *
  14 +from vae import *
  15 +import sklearn.metrics
  16 +import shelve
  17 +import pickle
  18 +from utils import *
  19 +import sys
  20 +import os
  21 +import json
  22 +# In[4]:
  23 +
  24 +infer_model=shelve.open("{}".format(sys.argv[2]))
  25 +in_dir = sys.argv[1]
  26 +#['ASR', 'TRS', 'LABEL']
  27 +# In[6]:
  28 +
  29 +
  30 +hidden_size= [60]
  31 +input_activation="tanh"
  32 +output_activation="sigmoid"
  33 +epochs=300
  34 +batch=1
  35 +patience=60
  36 +sgd = Adam(lr=0.0001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True)
  37 +latent_dim = 30
  38 +
  39 +
  40 +
  41 +mlp_h = [ 256 ]
  42 +mlp_loss = "categorical_crossentropy"
  43 +mlp_dropouts = []
  44 +mlp_sgd = Adam(lr=0.001)
  45 +mlp_epochs = 1000
  46 +mlp_batch_size = 16
  47 +mlp_output_activation="softmax"
  48 +
  49 +try :
  50 + sgd_repr=sgd.get_config()["name"]
  51 +except AttributeError :
  52 + sgd_repr=sgd
  53 +
  54 +try :
  55 + mlp_sgd_repr=mlp_sgd.get_config()["name"]
  56 +except AttributeError :
  57 + mlp_sgd_repr=mlp_sgd
  58 +
  59 +
  60 +params={ "h1" : "_".join([ str(x) for x in hidden_size ]),
  61 + "inside_activation" : input_activation,
  62 + "output_activation" : output_activation,
  63 + "epochs" : epochs ,
  64 + "batch_size" : batch,
  65 + "patience" : patience,
  66 + "sgd" : sgd_repr,
  67 + "mlp_h ": "_".join([str(x) for x in mlp_h]),
  68 + "mlp_loss ": mlp_loss,
  69 + "mlp_dropouts ": "_".join([str(x) for x in mlp_dropouts]),
  70 + "mlp_sgd ": mlp_sgd_repr,
  71 + "mlp_epochs ": mlp_epochs,
  72 + "mlp_batch_size ": mlp_batch_size,
  73 + "mlp_output" : mlp_output_activation
  74 + }
  75 +name = "_".join([ str(x) for x in params.values()])
  76 +try:
  77 + os.mkdir("{}/VAE_{}".format(in_dir,name))
  78 +except:
  79 + pass
  80 +db = shelve.open("{}/VAE_{}/ae_model.shelve".format(in_dir,name),writeback=True)
  81 +db["params"] = params
  82 +db["LABEL"]=infer_model["LABEL"]
  83 +#
  84 +json.dump(params,
  85 + open("{}/VAE_{}/ae_model.json".format(in_dir,name),"w"),
  86 + indent=4)
  87 +
  88 +keys = ["ASR","TRS"]
  89 +
  90 +db["VAE"] = {}
  91 +db["LDA"] = {}
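     +# per modality: baseline MLP on the LDA features, then a VAE whose representations each feed an MLP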
  92 +for mod in keys :
  93 + print mod
  94 + db["LDA"][mod] = train_mlp(infer_model["LDA"][mod]["TRAIN"],infer_model["LABEL"][mod]["TRAIN"],
  95 + infer_model["LDA"][mod]["DEV"],infer_model["LABEL"][mod]["DEV"],
  96 + infer_model["LDA"][mod]["TEST"],infer_model["LABEL"][mod]["TEST"],
  97 + mlp_h ,sgd=mlp_sgd,
  98 + epochs=mlp_epochs,
  99 + batch_size=mlp_batch_size,
  100 + input_activation=input_activation,
  101 + output_activation=mlp_output_activation,
  102 + dropouts=mlp_dropouts,
  103 + fit_verbose=0)
  104 +
  105 + res=train_vae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"],infer_model["LDA"][mod]["TEST"],
  106 + hidden_size=hidden_size[0],
  107 + latent_dim=latent_dim,sgd=sgd,
  108 + input_activation=input_activation,output_activation=output_activation,
  109 + nb_epochs=epochs,batch_size=batch)
  110 + mlp_res_list=[]
  111 + for layer in res :
  112 + mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"],
  113 + layer[1],infer_model["LABEL"][mod]["DEV"],
  114 + layer[2],infer_model["LABEL"][mod]["TEST"],
  115 + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs,
  116 + output_activation=mlp_output_activation,
  117 + input_activation=input_activation,
  118 + batch_size=mlp_batch_size,fit_verbose=0))
  119 + db["VAE"][mod]=mlp_res_list
  120 +
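     +# SPE: VAE fed with ASR features and trained to reconstruct the TRS features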
  121 +mod = "ASR"
  122 +mod2= "TRS"
  123 +mlp_res_list=[]
  124 +
  125 +res = train_vae(infer_model["LDA"][mod]["TRAIN"],
  126 + infer_model["LDA"][mod]["DEV"],
  127 + infer_model["LDA"][mod]["TEST"],
  128 + hidden_size=hidden_size[0],
  129 + sgd=sgd,input_activation=input_activation,output_activation=output_activation,
  130 + latent_dim=latent_dim,
  131 + nb_epochs=epochs,
  132 + batch_size=batch,
  133 + y_train=infer_model["LDA"][mod2]["TRAIN"],
  134 + y_dev=infer_model["LDA"][mod2]["DEV"],
  135 + y_test=infer_model["LDA"][mod2]["TEST"])
  136 +
  137 +for layer in res :
  138 + mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"],
  139 + layer[1],infer_model["LABEL"][mod]["DEV"],
  140 + layer[2],infer_model["LABEL"][mod]["TEST"],
  141 + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs,
  142 + output_activation=mlp_output_activation,
  143 + input_activation=input_activation,
  144 + batch_size=mlp_batch_size,fit_verbose=0))
  145 +
  146 +db["VAE"]["SPE"] = mlp_res_list
  147 +
  148 +db.sync()
  149 +db.close()
LDA/05-mmf_getscore.py
  1 +import numpy as np
  2 +import shelve
  3 +import sys
  4 +import glob
  5 +from collections import defaultdict
  6 +from tinydb import TinyDB, Query
  7 +from mako.template import Template
  8 +import time
  9 +
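     +# returns (max train, max dev, max test, test score at the best dev epoch)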
  10 +def get_best(x):
  11 + argbest=np.argmax(x[1])
  12 + maxdev=x[1][argbest]
  13 + maxtrain=np.max(x[0])
  14 + maxtest=np.max(x[2])
  15 + besttest=x[2][argbest]
  16 + return ( maxtrain,maxdev,maxtest,besttest)
  17 +depth = lambda L: isinstance(L, list) and max(map(depth, L))+1
  18 +
  19 +
  20 +template_name = '''
  21 +${name}
  22 +========================
  23 +
  24 +MLP scores :
  25 +-------------------
  26 +'''
  27 +template_value='''\n\n
  28 +| ${model} ${ttype} | train | dev |max test| best test|
  29 +| -------------------:|:--------:|:---------:|:------:|:--------:|
  30 +% for cpt,line in enumerate(models[model][ttype]):
  31 +| ${cpt} | ${line[0]} | ${line[1]} |${line[2]} | ${line[3]} |
  32 +% endfor
  33 +\n
  34 +'''
  35 +
  36 +# ae_model.shelve
  37 +def get_folder_file(x):
  38 + folder=x.split("/")[1]
  39 + shelve_file = ".".join(x.split(".")[:-1])
  40 + return(folder,shelve_file)
  41 +
  42 +in_folder = sys.argv[1]