Commit 2af8e57f4e1ebcfdd5ba9d3e8963c4853e472982
1 parent
e5108393c8
Exists in
master
change all
Showing 9 changed files with 428 additions and 124 deletions Side-by-side Diff
LDA/04a-mmdf.py
... | ... | @@ -31,8 +31,11 @@ |
31 | 31 | #db=shelve.open("SPELIKE_MLP_DB.shelve",writeback=True) |
32 | 32 | origin_corps=shelve.open("{}".format(sys.argv[2])) |
33 | 33 | in_dir = sys.argv[1] |
34 | +if len(sys.argv) > 3 : | |
35 | + features_key = sys.argv[3] | |
36 | +else : | |
37 | + features_key = "LDA" | |
34 | 38 | |
35 | - | |
36 | 39 | out_db=shelve.open("{}/mlp_scores.shelve".format(in_dir),writeback=True) |
37 | 40 | |
38 | 41 | mlp_h = [ 250, 250 ] |
39 | 42 | |
40 | 43 | |
... | ... | @@ -40,16 +43,16 @@ |
40 | 43 | mlp_dropouts = [0.25]* len(mlp_h) |
41 | 44 | mlp_sgd = Adam(lr=0.0001) |
42 | 45 | mlp_epochs = 3000 |
43 | -mlp_batch_size = 1 | |
46 | +mlp_batch_size = 5 | |
44 | 47 | mlp_input_activation = "relu" |
45 | 48 | mlp_output_activation="softmax" |
46 | 49 | |
47 | 50 | ress = [] |
48 | -for key in ["TRS", "ASR"] : | |
51 | +for key in origin_corps[features_key].keys() : | |
49 | 52 | |
50 | - res=mlp.train_mlp(origin_corps["LDA"][key]["TRAIN"],origin_corps["LABEL"][key]["TRAIN"], | |
51 | - origin_corps["LDA"][key]["DEV"],origin_corps["LABEL"][key]["DEV"], | |
52 | - origin_corps["LDA"][key]["TEST"],origin_corps["LABEL"][key]["TEST"], | |
53 | + res=mlp.train_mlp(origin_corps[features_key][key]["TRAIN"],origin_corps["LABEL"][key]["TRAIN"], | |
54 | + origin_corps[features_key][key]["DEV"],origin_corps["LABEL"][key]["DEV"], | |
55 | + origin_corps[features_key][key]["TEST"],origin_corps["LABEL"][key]["TEST"], | |
53 | 56 | mlp_h,dropouts=mlp_dropouts,sgd=mlp_sgd, |
54 | 57 | epochs=mlp_epochs, |
55 | 58 | batch_size=mlp_batch_size, |
LDA/04b-mmf_mini_ae.py
... | ... | @@ -10,6 +10,7 @@ |
10 | 10 | from sklearn import preprocessing |
11 | 11 | from keras.models import Sequential |
12 | 12 | from keras.optimizers import SGD,Adam |
13 | +from keras.layers.advanced_activations import ELU,PReLU | |
13 | 14 | from mlp import * |
14 | 15 | import sklearn.metrics |
15 | 16 | import shelve |
16 | 17 | |
... | ... | @@ -24,12 +25,24 @@ |
24 | 25 | in_dir = sys.argv[1] |
25 | 26 | #['ASR', 'TRS', 'LABEL'] |
26 | 27 | # In[6]: |
27 | - | |
28 | +if len(sys.argv) > 4 : | |
29 | + features_key = sys.argv[4] | |
30 | +else : | |
31 | + features_key = "LDA" | |
32 | +save_projection = True | |
28 | 33 | json_conf =json.load(open(sys.argv[3])) |
29 | 34 | ae_conf = json_conf["ae"] |
30 | 35 | |
31 | 36 | hidden_size= ae_conf["hidden_size"] |
32 | -input_activation=ae_conf["input_activation"] | |
37 | +input_activation = None | |
38 | +print ae_conf["input_activation"] | |
39 | +if ae_conf["input_activation"] == "elu": | |
40 | + print " ELU" | |
41 | + input_activation = PReLU() | |
42 | +else: | |
43 | + print " ELSE" | |
44 | + input_activation = ae_conf["input_activation"] | |
45 | +#input_activation=ae_conf["input_activation"] | |
33 | 46 | output_activation=ae_conf["output_activation"] |
34 | 47 | loss=ae_conf["loss"] |
35 | 48 | epochs=ae_conf["epochs"] |
36 | 49 | |
37 | 50 | |
... | ... | @@ -72,14 +85,18 @@ |
72 | 85 | db = shelve.open("{}/{}/ae_model.shelve".format(in_dir,name),writeback=True) |
73 | 86 | db["LABEL"]=infer_model["LABEL"] |
74 | 87 | # |
75 | -keys = ["ASR","TRS"] | |
88 | +keys = infer_model[features_key].keys() | |
76 | 89 | |
77 | 90 | db["AE"] = {} |
78 | -db["LDA"] = {} | |
91 | +db[features_key] = {} | |
79 | 92 | for mod in keys : |
80 | - db["LDA"][mod] = train_mlp(infer_model["LDA"][mod]["TRAIN"],infer_model["LABEL"][mod]["TRAIN"], | |
81 | - infer_model["LDA"][mod]["DEV"],infer_model["LABEL"][mod]["DEV"], | |
82 | - infer_model["LDA"][mod]["TEST"],infer_model["LABEL"][mod]["TEST"], | |
93 | + print infer_model[features_key][mod]["TRAIN"].shape | |
94 | + print infer_model[features_key][mod]["DEV"].shape | |
95 | + print infer_model[features_key][mod]["TEST"].shape | |
96 | + | |
97 | + db[features_key][mod] = train_mlp(infer_model[features_key][mod]["TRAIN"],infer_model["LABEL"][mod]["TRAIN"], | |
98 | + infer_model[features_key][mod]["DEV"],infer_model["LABEL"][mod]["DEV"], | |
99 | + infer_model[features_key][mod]["TEST"],infer_model["LABEL"][mod]["TEST"], | |
83 | 100 | mlp_h ,sgd=mlp_sgd, |
84 | 101 | epochs=mlp_epochs, |
85 | 102 | batch_size=mlp_batch_size, |
86 | 103 | |
... | ... | @@ -87,13 +104,25 @@ |
87 | 104 | output_activation=mlp_output_activation, |
88 | 105 | dropouts=mlp_dropouts, |
89 | 106 | fit_verbose=0) |
90 | - | |
91 | - res=train_ae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"],infer_model["LDA"][mod]["TEST"], | |
107 | + print input_activation | |
108 | + res=train_ae(infer_model[features_key][mod]["TRAIN"],infer_model[features_key][mod]["DEV"],infer_model[features_key][mod]["TEST"], | |
92 | 109 | hidden_size,patience = patience,sgd=sgd, |
93 | 110 | dropouts=do_do,input_activation=input_activation,output_activation=output_activation, |
94 | 111 | loss=loss,epochs=epochs,batch_size=batch,verbose=0) |
95 | 112 | mlp_res_list=[] |
96 | - for layer in res : | |
113 | + for nb,layer in enumerate(res) : | |
114 | + if save_projection: | |
115 | + pd = pandas.DataFrame(layer[0]) | |
116 | + col_count = (pd.sum(axis=0) != 0) | |
117 | + pd = pd.loc[:,col_count] | |
118 | + pd.to_hdf("{}/{}/AE_{}_{}_df.hdf".format(in_dir,name,nb,mod),"TRAIN") | |
119 | + pd = pandas.DataFrame(layer[1]) | |
120 | + pd = pd.loc[:,col_count] | |
121 | + pd.to_hdf("{}/{}/AE_{}_{}_df.hdf".format(in_dir,name,nb,mod),"DEV") | |
122 | + pd = pandas.DataFrame(layer[2]) | |
123 | + pd = pd.loc[:,col_count] | |
124 | + pd.to_hdf("{}/{}/AE_{}_{}_df.hdf".format(in_dir,name,nb,mod),"TEST") | |
125 | + del pd | |
97 | 126 | mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], |
98 | 127 | layer[1],infer_model["LABEL"][mod]["DEV"], |
99 | 128 | layer[2],infer_model["LABEL"][mod]["TEST"], |
100 | 129 | |
101 | 130 | |
102 | 131 | |
... | ... | @@ -103,30 +132,44 @@ |
103 | 132 | batch_size=mlp_batch_size,fit_verbose=0)) |
104 | 133 | db["AE"][mod]=mlp_res_list |
105 | 134 | |
106 | -mod = "ASR" | |
107 | -mod2= "TRS" | |
108 | -mlp_res_list=[] | |
135 | +if "ASR" in keys and "TRS" in keys: | |
136 | + mod = "ASR" | |
137 | + mod2= "TRS" | |
138 | + mlp_res_list=[] | |
109 | 139 | |
110 | -res = train_ae(infer_model["LDA"][mod]["TRAIN"], | |
111 | - infer_model["LDA"][mod]["DEV"], | |
112 | - infer_model["LDA"][mod]["TEST"], | |
113 | - hidden_size,dropouts=do_do,patience = patience, | |
114 | - sgd=sgd,input_activation=input_activation,output_activation=output_activation,loss=loss,epochs=epochs, | |
115 | - batch_size=batch, | |
116 | - y_train=infer_model["LDA"][mod]["TRAIN"], | |
117 | - y_dev=infer_model["LDA"][mod2]["DEV"], | |
118 | - y_test=infer_model["LDA"][mod2]["TEST"]) | |
140 | + res = train_ae(infer_model[features_key][mod]["TRAIN"], | |
141 | + infer_model[features_key][mod]["DEV"], | |
142 | + infer_model[features_key][mod]["TEST"], | |
143 | + hidden_size,dropouts=do_do,patience = patience, | |
144 | + sgd=sgd,input_activation=input_activation,output_activation=output_activation,loss=loss,epochs=epochs, | |
145 | + batch_size=batch, | |
146 | + y_train=infer_model[features_key][mod]["TRAIN"], | |
147 | + y_dev=infer_model[features_key][mod2]["DEV"], | |
148 | + y_test=infer_model[features_key][mod2]["TEST"]) | |
119 | 149 | |
120 | -for layer in res : | |
121 | - mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], | |
122 | - layer[1],infer_model["LABEL"][mod]["DEV"], | |
123 | - layer[2],infer_model["LABEL"][mod]["TEST"], | |
124 | - mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs, | |
125 | - output_activation=mlp_output_activation, | |
126 | - input_activation=input_activation, | |
127 | - batch_size=mlp_batch_size,fit_verbose=0)) | |
150 | + for nb,layer in enumerate(res) : | |
151 | + if save_projection: | |
152 | + pd = pandas.DataFrame(layer[0]) | |
153 | + col_count= (pd.sum(axis=0) != 0) | |
154 | + pd = pd.loc[:,col_count] | |
155 | + pd.to_hdf("{}/{}/AE_{}_{}_df.hdf".format(in_dir,name,nb,"SPE"),"TRAIN") | |
156 | + pd = pandas.DataFrame(layer[1]) | |
157 | + pd = pd.loc[:,col_count] | |
158 | + pd.to_hdf("{}/{}/AE_{}_{}_df.hdf".format(in_dir,name,nb,"SPE"),"DEV") | |
159 | + pd = pandas.DataFrame(layer[2]) | |
160 | + pd = pd.loc[:,col_count] | |
161 | + pd.to_hdf("{}/{}/AE_{}_{}_df.hdf".format(in_dir,name,nb,"SPE"),"TEST") | |
162 | + del pd | |
128 | 163 | |
129 | -db["AE"]["SPE"] = mlp_res_list | |
164 | + mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], | |
165 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
166 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
167 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs, | |
168 | + output_activation=mlp_output_activation, | |
169 | + input_activation=input_activation, | |
170 | + batch_size=mlp_batch_size,fit_verbose=0)) | |
171 | + | |
172 | + db["AE"]["SPE"] = mlp_res_list | |
130 | 173 | |
131 | 174 | db.sync() |
132 | 175 | db.close() |
LDA/04c-mmf_sae.py
... | ... | @@ -23,6 +23,11 @@ |
23 | 23 | |
24 | 24 | infer_model=shelve.open("{}".format(sys.argv[2])) |
25 | 25 | in_dir = sys.argv[1] |
26 | +if len(sys.argv) > 4 : | |
27 | + features_key = sys.argv[4] | |
28 | +else : | |
29 | + features_key = "LDA" | |
30 | +save_projection = True | |
26 | 31 | #['ASR', 'TRS', 'LABEL'] |
27 | 32 | # In[6]: |
28 | 33 | json_conf =json.load(open(sys.argv[3])) |
29 | 34 | |
... | ... | @@ -47,13 +52,13 @@ |
47 | 52 | sgd = sae_conf["sgd"] |
48 | 53 | |
49 | 54 | name = json_conf["name"] |
55 | +print name | |
50 | 56 | try: |
51 | 57 | os.mkdir("{}/{}".format(in_dir,name)) |
52 | 58 | except: |
53 | 59 | pass |
54 | 60 | db = shelve.open("{}/{}/ae_model.shelve".format(in_dir,name),writeback=True) |
55 | 61 | # |
56 | -keys = ["ASR","TRS"] | |
57 | 62 | mlp_conf = json_conf["mlp"] |
58 | 63 | mlp_h = mlp_conf["hidden_size"] |
59 | 64 | mlp_loss = mlp_conf["loss"] |
60 | 65 | |
61 | 66 | |
62 | 67 | |
63 | 68 | |
... | ... | @@ -72,23 +77,38 @@ |
72 | 77 | except : |
73 | 78 | mlp_sgd = mlp_conf["sgd"] |
74 | 79 | |
75 | - | |
80 | +keys = infer_model[features_key].keys() | |
76 | 81 | db["SAE"] = {} |
77 | 82 | |
78 | 83 | db["SAEFT"] = {} |
79 | 84 | for mod in keys : |
80 | - res_tuple=train_sae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"], | |
81 | - infer_model["LDA"][mod]["TEST"], | |
85 | + res_tuple=train_sae(infer_model[features_key][mod]["TRAIN"],infer_model[features_key][mod]["DEV"], | |
86 | + infer_model[features_key][mod]["TEST"], | |
82 | 87 | hidden_size,dropouts=do_do, |
83 | 88 | patience = "patience",sgd=sgd,input_activation="tanh", |
84 | 89 | output_activation="tanh",loss=loss,epochs=epochs, |
85 | 90 | batch_size=batch,verbose=0) |
86 | 91 | #print len(res), [len(x) for x in res[0]], [ len(x) for x in res[1]] |
87 | - for name , levels in zip(["SAE","SAEFT"],res_tuple): | |
92 | + for i, levels in zip(["SAE","SAEFT"],res_tuple): | |
88 | 93 | mlp_res_by_level = [] |
89 | - for res in levels: | |
94 | + for lvl,res in enumerate(levels): | |
90 | 95 | mlp_res_list=[] |
91 | 96 | for nb,layer in enumerate(res) : |
97 | + if save_projection: | |
98 | + pd = pandas.DataFrame(layer[0]) | |
99 | + col_count= (pd.sum(axis=0) != 0) | |
100 | + pd = pd.loc[:,col_count] | |
101 | + hdffile = "{}/{}/{}_{}_{}_{}_df.hdf".format(in_dir,name,i,lvl,nb,mod) | |
102 | + print hdffile | |
103 | + pd.to_hdf(hdffile,"TRAIN") | |
104 | + pd = pandas.DataFrame(layer[1]) | |
105 | + pd = pd.loc[:,col_count] | |
106 | + pd.to_hdf(hdffile,"DEV") | |
107 | + pd = pandas.DataFrame(layer[2]) | |
108 | + pd = pd.loc[:,col_count] | |
109 | + pd.to_hdf(hdffile,"TEST") | |
110 | + del pd | |
111 | + | |
92 | 112 | mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], |
93 | 113 | layer[1],infer_model["LABEL"][mod]["DEV"], |
94 | 114 | layer[2],infer_model["LABEL"][mod]["TEST"], |
95 | 115 | |
96 | 116 | |
... | ... | @@ -96,33 +116,48 @@ |
96 | 116 | sgd=mlp_sgd,epochs=mlp_epochs,batch_size=mlp_batch_size, |
97 | 117 | fit_verbose=0)) |
98 | 118 | mlp_res_by_level.append(mlp_res_list) |
99 | - db[name][mod]=mlp_res_by_level | |
119 | + db[i][mod]=mlp_res_by_level | |
100 | 120 | |
101 | -mod = "ASR" | |
102 | -mod2= "TRS" | |
103 | -res_tuple = train_sae(infer_model["LDA"][mod]["TRAIN"], | |
104 | - infer_model["LDA"][mod]["DEV"], | |
105 | - infer_model["LDA"][mod]["TEST"], | |
106 | - hidden_size,dropouts=[0],patience="patience", | |
107 | - sgd=sgd,input_activation=input_activation,output_activation=input_activation, | |
108 | - loss=loss,epochs=epochs,batch_size=batch, | |
109 | - y_train=infer_model["LDA"][mod2]["TRAIN"], | |
110 | - y_dev=infer_model["LDA"][mod2]["DEV"], | |
111 | - y_test=infer_model["LDA"][mod2]["TEST"]) | |
112 | 121 | |
113 | -for name , levels in zip(["SAE","SAEFT"],res_tuple): | |
114 | - mlp_res_by_level = [] | |
115 | - for res in levels : | |
116 | - mlp_res_list=[] | |
117 | - for layer in res : | |
118 | - mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], | |
119 | - layer[1],infer_model["LABEL"][mod]["DEV"],layer[2], | |
120 | - infer_model["LABEL"][mod]["TEST"], | |
121 | - mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, | |
122 | - sgd=mlp_sgd,epochs=mlp_epochs,batch_size=mlp_batch_size, | |
123 | - fit_verbose=0)) | |
124 | - mlp_res_by_level.append(mlp_res_list) | |
125 | - db[name]["SPE"] = mlp_res_by_level | |
122 | +if "ASR" in keys and "TRS" in keys : | |
123 | + mod = "ASR" | |
124 | + mod2= "TRS" | |
125 | + res_tuple = train_sae(infer_model[features_key][mod]["TRAIN"], | |
126 | + infer_model[features_key][mod]["DEV"], | |
127 | + infer_model[features_key][mod]["TEST"], | |
128 | + hidden_size,dropouts=[0],patience="patience", | |
129 | + sgd=sgd,input_activation=input_activation,output_activation=input_activation, | |
130 | + loss=loss,epochs=epochs,batch_size=batch, | |
131 | + y_train=infer_model[features_key][mod2]["TRAIN"], | |
132 | + y_dev=infer_model[features_key][mod2]["DEV"], | |
133 | + y_test=infer_model[features_key][mod2]["TEST"]) | |
134 | + | |
135 | + for i , levels in zip(["SAE","SAEFT"],res_tuple): | |
136 | + mlp_res_by_level = [] | |
137 | + for lvl,res in enumerate(levels) : | |
138 | + mlp_res_list=[] | |
139 | + for nb,layer in enumerate(res) : | |
140 | + if save_projection: | |
141 | + pd = pandas.DataFrame(layer[0]) | |
142 | + col_count= (pd.sum(axis=0) != 0) | |
143 | + pd = pd.loc[:,col_count] | |
144 | + pd.to_hdf("{}/{}/{}_{}_{}_{}_df.hdf".format(in_dir,name,i,lvl,nb,"SPE"),"TRAIN") | |
145 | + pd = pandas.DataFrame(layer[1]) | |
146 | + pd = pd.loc[:,col_count] | |
147 | + pd.to_hdf("{}/{}/{}_{}_{}_{}_df.hdf".format(in_dir,name,i,lvl,nb,"SPE"),"DEV") | |
148 | + pd = pandas.DataFrame(layer[2]) | |
149 | + pd = pd.loc[:,col_count] | |
150 | + pd.to_hdf("{}/{}/{}_{}_{}_{}_df.hdf".format(in_dir,name,i,lvl,nb,"SPE"),"TEST") | |
151 | + del pd | |
152 | + | |
153 | + mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], | |
154 | + layer[1],infer_model["LABEL"][mod]["DEV"],layer[2], | |
155 | + infer_model["LABEL"][mod]["TEST"], | |
156 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, | |
157 | + sgd=mlp_sgd,epochs=mlp_epochs,batch_size=mlp_batch_size, | |
158 | + fit_verbose=0)) | |
159 | + mlp_res_by_level.append(mlp_res_list) | |
160 | + db[i]["SPE"] = mlp_res_by_level | |
126 | 161 | |
127 | 162 | db.sync() |
128 | 163 | db.close() |
LDA/04d-mmf_dsae.py
... | ... | @@ -26,6 +26,10 @@ |
26 | 26 | in_dir = sys.argv[1] |
27 | 27 | #['ASR', 'TRS', 'LABEL'] |
28 | 28 | # In[6]: |
29 | +if len(sys.argv) > 4 : | |
30 | + features_key = sys.argv[4] | |
31 | +else : | |
32 | + features_key = "LDA" | |
29 | 33 | |
30 | 34 | json_conf =json.load(open(sys.argv[3])) |
31 | 35 | |
... | ... | @@ -101,9 +105,9 @@ |
101 | 105 | |
102 | 106 | db["DSAEFT"] = {} |
103 | 107 | mod = "ASR" |
104 | -res_tuple_ASR = train_ae(infer_model["LDA"][mod]["TRAIN"], | |
105 | - infer_model["LDA"][mod]["DEV"], | |
106 | - infer_model["LDA"][mod]["TEST"], | |
108 | +res_tuple_ASR = train_ae(infer_model[features_key][mod]["TRAIN"], | |
109 | + infer_model[features_key][mod]["DEV"], | |
110 | + infer_model[features_key][mod]["TEST"], | |
107 | 111 | hidden_size,dropouts=do_do, |
108 | 112 | patience = patience,sgd=sgd, |
109 | 113 | input_activation=input_activation, |
... | ... | @@ -122,9 +126,9 @@ |
122 | 126 | |
123 | 127 | db["DSAE"][mod] = mlp_res_list |
124 | 128 | mod = "TRS" |
125 | -res_tuple_TRS = train_ae(infer_model["LDA"][mod]["TRAIN"], | |
126 | - infer_model["LDA"][mod]["DEV"], | |
127 | - infer_model["LDA"][mod]["TEST"], | |
129 | +res_tuple_TRS = train_ae(infer_model[features_key][mod]["TRAIN"], | |
130 | + infer_model[features_key][mod]["DEV"], | |
131 | + infer_model[features_key][mod]["TEST"], | |
128 | 132 | hidden_size,dropouts=do_do, |
129 | 133 | sgd=sgd,input_activation=input_activation, |
130 | 134 | output_activation=output_activation,loss=loss,epochs=epochs, |
... | ... | @@ -202,12 +206,12 @@ |
202 | 206 | |
203 | 207 | #print "Wtr", len(Wtr), [ len(x) for x in Wtr],[ len(x[1]) for x in Wtr] |
204 | 208 | |
205 | -ft_res = ft_dsae(infer_model["LDA"]["ASR"]["TRAIN"], | |
206 | - infer_model["LDA"]["ASR"]["DEV"], | |
207 | - infer_model["LDA"]["ASR"]["TEST"], | |
208 | - y_train=infer_model["LDA"]["TRS"]["TRAIN"], | |
209 | - y_dev=infer_model["LDA"]["TRS"]["DEV"], | |
210 | - y_test=infer_model["LDA"]["TRS"]["TEST"], | |
209 | +ft_res = ft_dsae(infer_model[features_key]["ASR"]["TRAIN"], | |
210 | + infer_model[features_key]["ASR"]["DEV"], | |
211 | + infer_model[features_key]["ASR"]["TEST"], | |
212 | + y_train=infer_model[features_key]["TRS"]["TRAIN"], | |
213 | + y_dev=infer_model[features_key]["TRS"]["DEV"], | |
214 | + y_test=infer_model[features_key]["TRS"]["TEST"], | |
211 | 215 | ae_hidden = hidden_size, |
212 | 216 | transfer_hidden = trans_hidden_size, |
213 | 217 | start_weights = WA, |
LDA/04e-mm_vae.py
... | ... | @@ -21,7 +21,12 @@ |
21 | 21 | in_dir = sys.argv[1] |
22 | 22 | #['ASR', 'TRS', 'LABEL'] |
23 | 23 | # In[6]: |
24 | +if len(sys.argv) > 4 : | |
25 | + features_key = sys.argv[4] | |
26 | +else : | |
27 | + features_key = "LDA" | |
24 | 28 | |
29 | +save_projection = True | |
25 | 30 | json_conf =json.load(open(sys.argv[3])) |
26 | 31 | vae_conf = json_conf["vae"] |
27 | 32 | |
28 | 33 | |
... | ... | @@ -63,10 +68,11 @@ |
63 | 68 | |
64 | 69 | name = json_conf["name"] |
65 | 70 | |
66 | - | |
67 | -try: | |
71 | +try : | |
72 | + print "make folder " | |
68 | 73 | os.mkdir("{}/{}".format(in_dir,name)) |
69 | 74 | except: |
75 | + print "folder not made" | |
70 | 76 | pass |
71 | 77 | |
72 | 78 | |
73 | 79 | |
74 | 80 | |
75 | 81 | |
... | ... | @@ -74,15 +80,16 @@ |
74 | 80 | db["LABEL"]=infer_model["LABEL"] |
75 | 81 | # |
76 | 82 | |
77 | -keys = ["ASR","TRS"] | |
78 | 83 | |
84 | +keys = infer_model[features_key].keys() | |
85 | + | |
79 | 86 | db["VAE"] = {} |
80 | -db["LDA"] = {} | |
87 | +db[features_key] = {} | |
81 | 88 | for mod in keys : |
82 | 89 | #print mod |
83 | - db["LDA"][mod] = train_mlp(infer_model["LDA"][mod]["TRAIN"],infer_model["LABEL"][mod]["TRAIN"], | |
84 | - infer_model["LDA"][mod]["DEV"],infer_model["LABEL"][mod]["DEV"], | |
85 | - infer_model["LDA"][mod]["TEST"],infer_model["LABEL"][mod]["TEST"], | |
90 | + db[features_key][mod] = train_mlp(infer_model[features_key][mod]["TRAIN"],infer_model["LABEL"][mod]["TRAIN"], | |
91 | + infer_model[features_key][mod]["DEV"],infer_model["LABEL"][mod]["DEV"], | |
92 | + infer_model[features_key][mod]["TEST"],infer_model["LABEL"][mod]["TEST"], | |
86 | 93 | mlp_h ,sgd=mlp_sgd, |
87 | 94 | epochs=mlp_epochs, |
88 | 95 | batch_size=mlp_batch_size, |
89 | 96 | |
... | ... | @@ -91,13 +98,26 @@ |
91 | 98 | dropouts=mlp_dropouts, |
92 | 99 | fit_verbose=0) |
93 | 100 | |
94 | - res=train_vae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"],infer_model["LDA"][mod]["TEST"], | |
101 | + res=train_vae(infer_model[features_key][mod]["TRAIN"],infer_model[features_key][mod]["DEV"],infer_model[features_key][mod]["TEST"], | |
95 | 102 | hidden_size=hidden_size[0], |
96 | 103 | latent_dim=latent_dim,sgd=sgd, |
97 | 104 | input_activation=input_activation,output_activation=output_activation, |
98 | 105 | nb_epochs=epochs,batch_size=batch) |
99 | 106 | mlp_res_list=[] |
100 | - for layer in res : | |
107 | + for nb,layer in enumerate(res) : | |
108 | + if save_projection: | |
109 | + pd = pandas.DataFrame(layer[0]) | |
110 | + col_count = (pd.sum(axis=0) != 0) | |
111 | + pd = pd.loc[:,col_count] | |
112 | + pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,mod),"TRAIN") | |
113 | + pd = pandas.DataFrame(layer[1]) | |
114 | + pd = pd.loc[:,col_count] | |
115 | + pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,mod),"DEV") | |
116 | + pd = pandas.DataFrame(layer[2]) | |
117 | + pd = pd.loc[:,col_count] | |
118 | + pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,mod),"TEST") | |
119 | + del pd | |
120 | + | |
101 | 121 | mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], |
102 | 122 | layer[1],infer_model["LABEL"][mod]["DEV"], |
103 | 123 | layer[2],infer_model["LABEL"][mod]["TEST"], |
104 | 124 | |
105 | 125 | |
106 | 126 | |
... | ... | @@ -107,32 +127,46 @@ |
107 | 127 | batch_size=mlp_batch_size,fit_verbose=0)) |
108 | 128 | db["VAE"][mod]=mlp_res_list |
109 | 129 | |
110 | -mod = "ASR" | |
111 | -mod2= "TRS" | |
112 | -mlp_res_list=[] | |
130 | +if "ASR" in keys and "TRS" in keys : | |
131 | + mod = "ASR" | |
132 | + mod2= "TRS" | |
133 | + mlp_res_list=[] | |
113 | 134 | |
114 | -res = train_vae(infer_model["LDA"][mod]["TRAIN"], | |
115 | - infer_model["LDA"][mod]["DEV"], | |
116 | - infer_model["LDA"][mod]["TEST"], | |
117 | - hidden_size=hidden_size[0], | |
118 | - sgd=sgd,input_activation=input_activation,output_activation=output_activation, | |
119 | - latent_dim=latent_dim, | |
120 | - nb_epochs=epochs, | |
121 | - batch_size=batch, | |
122 | - y_train=infer_model["LDA"][mod2]["TRAIN"], | |
123 | - y_dev=infer_model["LDA"][mod2]["DEV"], | |
124 | - y_test=infer_model["LDA"][mod2]["TEST"]) | |
135 | + res = train_vae(infer_model[features_key][mod]["TRAIN"], | |
136 | + infer_model[features_key][mod]["DEV"], | |
137 | + infer_model[features_key][mod]["TEST"], | |
138 | + hidden_size=hidden_size[0], | |
139 | + sgd=sgd,input_activation=input_activation,output_activation=output_activation, | |
140 | + latent_dim=latent_dim, | |
141 | + nb_epochs=epochs, | |
142 | + batch_size=batch, | |
143 | + y_train=infer_model[features_key][mod2]["TRAIN"], | |
144 | + y_dev=infer_model[features_key][mod2]["DEV"], | |
145 | + y_test=infer_model[features_key][mod2]["TEST"]) | |
125 | 146 | |
126 | -for layer in res : | |
127 | - mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], | |
128 | - layer[1],infer_model["LABEL"][mod]["DEV"], | |
129 | - layer[2],infer_model["LABEL"][mod]["TEST"], | |
130 | - mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs, | |
131 | - output_activation=mlp_output_activation, | |
132 | - input_activation=input_activation, | |
133 | - batch_size=mlp_batch_size,fit_verbose=0)) | |
147 | + for nb,layer in enumerate(res) : | |
148 | + if save_projection: | |
149 | + pd = pandas.DataFrame(layer[0]) | |
150 | + col_count = (pd.sum(axis=0) != 0) | |
151 | + pd = pd.loc[:,col_count] | |
152 | + pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,"SPE"),"TRAIN") | |
153 | + pd = pandas.DataFrame(layer[1]) | |
154 | + pd = pd.loc[:,col_count] | |
155 | + pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,"SPE"),"DEV") | |
156 | + pd = pandas.DataFrame(layer[2]) | |
157 | + pd = pd.loc[:,col_count] | |
158 | + pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,"SPE"),"TEST") | |
159 | + del pd | |
134 | 160 | |
135 | -db["VAE"]["SPE"] = mlp_res_list | |
161 | + mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], | |
162 | + layer[1],infer_model["LABEL"][mod]["DEV"], | |
163 | + layer[2],infer_model["LABEL"][mod]["TEST"], | |
164 | + mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs, | |
165 | + output_activation=mlp_output_activation, | |
166 | + input_activation=input_activation, | |
167 | + batch_size=mlp_batch_size,fit_verbose=0)) | |
168 | + | |
169 | + db["VAE"]["SPE"] = mlp_res_list | |
136 | 170 | |
137 | 171 | db.sync() |
138 | 172 | db.close() |
LDA/05-lts_scoring.py
1 | +import sys | |
2 | +import shelve | |
3 | +import pickle | |
4 | +from utils import * | |
5 | +import sys | |
6 | +import os | |
7 | +import json | |
8 | +import glob | |
9 | +import tempfile | |
10 | +import pandas | |
11 | +import subprocess | |
12 | +from subprocess import CalledProcessError | |
13 | +import shutil | |
14 | +import numpy | |
15 | + | |
16 | +in_dir = sys.argv[1] | |
17 | +json_conf =json.load(open(sys.argv[2])) | |
18 | +name = json_conf["name"] | |
19 | + | |
20 | +ae_m = shelve.open("{}/{}/ae_model.shelve".format(in_dir,name)) | |
21 | +y_train=numpy.argmax(ae_m["LABEL"]["ASR"]["TRAIN"],axis=1) | |
22 | +_,ytr_path=tempfile.mkstemp() | |
23 | +ytr_open= open(ytr_path,"w") | |
24 | +for el in y_train: | |
25 | + print >>ytr_open, el | |
26 | +ytr_open.close() | |
27 | + | |
28 | +y_dev=numpy.argmax(ae_m["LABEL"]["ASR"]["DEV"],axis=1) | |
29 | +_,yd_path=tempfile.mkstemp() | |
30 | +yd_open = open(yd_path,"w") | |
31 | +for el in y_dev: | |
32 | + print >>yd_open, el | |
33 | +yd_open.close() | |
34 | + | |
35 | +y_test=numpy.argmax(ae_m["LABEL"]["ASR"]["TEST"],axis=1) | |
36 | +_,yte_path=tempfile.mkstemp() | |
37 | +yte_open=open(yte_path,"w") | |
38 | +for el in y_test: | |
39 | + print >>yte_open, el | |
40 | +yte_open.close() | |
41 | + | |
42 | +hdfs_files=glob.glob("{}/{}/*.hdf".format(in_dir,name)) | |
43 | +temp_dir=tempfile.mkdtemp() | |
44 | +out_file=open("{}/{}/malaha_res.txt".format(in_dir,name),"a") | |
45 | + | |
46 | +for hdf in hdfs_files: | |
47 | + print >>out_file, "Start ---------------------------------------------------" | |
48 | + print >>out_file, hdf | |
49 | + x_train = pandas.read_hdf(hdf,"TRAIN") | |
50 | + x_train.to_csv("{}/xtrain.dat".format(temp_dir),sep=" ",header=False,index=False, index_label=False) | |
51 | + x_train = pandas.read_hdf(hdf,"DEV") | |
52 | + x_train.to_csv("{}/xdev.dat".format(temp_dir),sep=" ",header=False,index=False, index_label=False) | |
53 | + x_train = pandas.read_hdf(hdf,"TEST") | |
54 | + x_train.to_csv("{}/xtest.dat".format(temp_dir),sep=" ",header=False,index=False, index_label=False) | |
55 | + try : | |
56 | + resdev=subprocess.check_output(['Rscript', | |
57 | + '/home/laboinfo/janod/WorkingDir/erreur_traduction/Author_Topic_Decoda/estimate.R', | |
58 | + "{}/xtrain.dat".format(temp_dir), | |
59 | + "{}/xdev.dat".format(temp_dir), | |
60 | + ytr_path,yd_path]) | |
61 | + | |
62 | + restest=subprocess.check_output(['Rscript', | |
63 | + '/home/laboinfo/janod/WorkingDir/erreur_traduction/Author_Topic_Decoda/estimate.R', | |
64 | + "{}/xtrain.dat".format(temp_dir), | |
65 | + "{}/xtest.dat".format(temp_dir), | |
66 | + ytr_path,yte_path]) | |
67 | + | |
68 | + print >>out_file, resdev | |
69 | + print >>out_file, hdf | |
70 | + print >>out_file, restest | |
71 | + except CalledProcessError: | |
72 | + print >>out_file, "FAILED" | |
73 | + print >>out_file, hdf | |
74 | + print >>out_file, "End ---------------------------------------------------" | |
75 | + | |
76 | +shutil.rmtree(temp_dir) | |
77 | +os.remove(ytr_path) | |
78 | +os.remove(yd_path) | |
79 | +os.remove(yte_path) |
LDA/mlp.py
... | ... | @@ -82,7 +82,6 @@ |
82 | 82 | |
83 | 83 | def train_mlp(x_train,y_train,x_dev,y_dev,x_test,y_test,hidden_size,input_activation="relu",hidden_activation="relu",output_activation="softmax",loss="mse",init="glorot_uniform",dropouts=None,sgd=None,epochs=1200,batch_size=16,fit_verbose=1,test_verbose=0,save_pred=False,keep_histo=False): |
84 | 84 | |
85 | - | |
86 | 85 | layers = [Input(shape=(x_train.shape[1],))] |
87 | 86 | |
88 | 87 | for h in hidden_size: |
LDA/utils.py
1 | 1 | # -*- coding: utf-8 -*- |
2 | 2 | import nltk |
3 | 3 | import re |
4 | +import codecs | |
5 | +import numpy as np | |
6 | +import sqlite3 | |
7 | + | |
4 | 8 | pattern = ur"\d+(?:\.\d+)?\s*%?|\w{1,2}'|<unk>|[\wรฉร รจรนรชรดรปรขรฒรฌรฎรง]+|[^\w\s]" |
5 | 9 | rer_b = re.compile(ur" r e r(?: e r)? b ") |
6 | 10 | rer_c = re.compile(ur" r e r(?: e r)? c |r e r( e r)? c' est | rer c' est") |
... | ... | @@ -43,4 +47,55 @@ |
43 | 47 | |
44 | 48 | def select_mmf(elm): |
45 | 49 | return int(elm.split("_")[0]) |
50 | + | |
51 | +def get_score(table): | |
52 | + mx_train = np.max(table[0]) | |
53 | + argmx_dev = np.argmax(table[1]) | |
54 | + mx_dev = table[1][argmx_dev] | |
55 | + best_test = table[2][argmx_dev] | |
56 | + mx_test = np.max(table[2]) | |
57 | + print """\tmax train : {} | |
58 | + \tmax dev : {} | |
59 | + \tmax test : {} - best test : {} | |
60 | + \t best epochs : {}""".format(mx_train,mx_dev,mx_test,best_test,argmx_dev) | |
61 | + return mx_train,mx_dev,mx_test,best_test,argmx_dev | |
62 | +class WeightedWordsList : | |
63 | + @staticmethod | |
64 | + def get_key(wtuple): | |
65 | + return wtuple[1] | |
66 | + @staticmethod | |
67 | + def get_okey(wtuple): | |
68 | + return wtuple[1][1] | |
69 | + | |
70 | + | |
71 | + def __init__(self,file_path): | |
72 | + self.wlist = codecs.open(file_path,"r","utf8").readlines() | |
73 | + self.wlist = [x.strip().split(':') for x in self.wlist ] | |
74 | + self.wlist = [ (x, float(y)) for x,y in self.wlist ] | |
75 | + self.wdict = {} | |
76 | + for x,y in self.wlist: | |
77 | + self.wdict[x.encode("utf8")] = y | |
78 | + | |
79 | + def select_best(self,word_list,lenght=5): | |
80 | + scored_word = [] | |
81 | + for w in word_list: | |
82 | + w = w.encode("utf8") | |
83 | + if w not in self.wdict : | |
84 | + continue | |
85 | + | |
86 | + if len(scored_word) < lenght: | |
87 | + scored_word.append((w,self.wdict[w])) | |
88 | + else : | |
89 | + w_min= min(enumerate(scored_word),key=WeightedWordsList.get_okey) | |
90 | + w_curr = (w, self.wdict[w]) | |
91 | + if w_min[1][1] < w_curr[1]: | |
92 | + del scored_word[w_min[0]] | |
93 | + scored_word.append(w_curr) | |
94 | + w_min = min(enumerate(scored_word),key=WeightedWordsList.get_okey) | |
95 | + while len(scored_word) > lenght and w_min[1][1] < w_curr[1] : | |
96 | + del scored_word[w_min[0]] | |
97 | + w_min = min(enumerate(scored_word),key=WeightedWordsList.get_okey) | |
98 | + elif w_min[1][1] == w_curr[1]: | |
99 | + scored_word.append(w_curr) | |
100 | + return [ w[0] for w in scored_word ] |
LDA/vae.py
... | ... | @@ -16,15 +16,59 @@ |
16 | 16 | from keras import backend as K |
17 | 17 | from keras import objectives |
18 | 18 | from keras.datasets import mnist |
19 | +from keras.callbacks import EarlyStopping,Callback | |
19 | 20 | |
20 | 21 | import pandas |
21 | 22 | import shelve |
22 | 23 | import pickle |
23 | 24 | |
24 | 25 | |
26 | +class ZeroStopping(Callback): | |
27 | + '''Stop training when a monitored quantity has stopped improving. | |
28 | + # Arguments | |
29 | + monitor: quantity to be monitored. | |
30 | + patience: number of epochs with no improvement | |
31 | + after which training will be stopped. | |
32 | + verbose: verbosity mode. | |
33 | + mode: one of {auto, min, max}. In 'min' mode, | |
34 | + training will stop when the quantity | |
35 | + monitored has stopped decreasing; in 'max' | |
36 | + mode it will stop when the quantity | |
37 | + monitored has stopped increasing. | |
38 | + ''' | |
39 | + def __init__(self, monitor='val_loss', verbose=0, mode='auto', thresh = 0): | |
40 | + super(ZeroStopping, self).__init__() | |
25 | 41 | |
42 | + self.monitor = monitor | |
43 | + self.verbose = verbose | |
44 | + self.thresh = thresh # stopping threshold for the monitored quantity | |
26 | 45 | |
46 | + if mode not in ['auto', 'min', 'max']: | |
47 | + warnings.warn('EarlyStopping mode %s is unknown, ' | |
48 | + 'fallback to auto mode.' % (mode), | |
49 | + RuntimeWarning) | |
50 | + mode = 'auto' | |
27 | 51 | |
52 | + if mode == 'min': | |
53 | + self.monitor_op = np.less | |
54 | + elif mode == 'max': | |
55 | + self.monitor_op = np.greater | |
56 | + else: | |
57 | + if 'acc' in self.monitor: | |
58 | + self.monitor_op = np.greater | |
59 | + else: | |
60 | + self.monitor_op = np.less | |
61 | + | |
62 | + def on_epoch_end(self, epoch, logs={}): | |
63 | + current = logs.get(self.monitor) | |
64 | + if current is None: | |
65 | + warnings.warn('Zero stopping requires %s available!' % | |
66 | + (self.monitor), RuntimeWarning) | |
67 | + | |
68 | + if self.monitor_op(current, self.thresh): | |
69 | + self.best = current | |
70 | + self.model.stop_training = True | |
71 | + | |
28 | 72 | #batch_size = 16 |
29 | 73 | #original_dim = 784 |
30 | 74 | #latent_dim = 2 |
31 | 75 | |
... | ... | @@ -82,8 +126,11 @@ |
82 | 126 | vae.fit(x_train, y_train, |
83 | 127 | shuffle=True, |
84 | 128 | nb_epoch=nb_epochs, |
129 | + verbose = 1, | |
85 | 130 | batch_size=batch_size, |
86 | - validation_data=(x_dev, y_dev)) | |
131 | + validation_data=(x_dev, y_dev), | |
132 | + callbacks = [ZeroStopping(monitor='val_loss', thresh=0, verbose=0, mode='min')] | |
133 | + ) | |
87 | 134 | |
88 | 135 | # build a model to project inputs on the latent space |
89 | 136 | encoder = Model(x, z_mean) |