Commit 7db73861ffbab3f3f51b17188d8894a512b36264
1 parent: b6d0165d16
Exists in: master

add vae and mmf

Showing 13 changed files with 1084 additions and 44 deletions (inline diff)
LDA/00-mmf_make_features.py
File was created | 1 | import sys | |
2 | import os | ||
3 | |||
4 | import pandas | ||
5 | import numpy | ||
6 | import shelve | ||
7 | |||
8 | from sklearn.preprocessing import LabelBinarizer | ||
9 | |||
10 | from utils import select_mmf as select | ||
11 | |||
12 | input_dir = sys.argv[1] # top-level folder containing ASR and TRS | ||
13 | level = sys.argv[2] # desired LDA size ( -5) | ||
14 | |||
15 | lb=LabelBinarizer() | ||
16 | #y_train=lb.fit_transform([utils.select(ligneid) for ligneid in origin_corps["LABEL"]["TRAIN"]]) | ||
17 | |||
18 | |||
19 | data = shelve.open("{}/mmf_{}.shelve".format(input_dir,level),writeback=True) # writeback so the nested dict updates below persist | ||
20 | data["LABEL"], data["LDA"] = {}, {} | ||
21 | for mod in ["ASR", "TRS"]: | ||
22 | train = pandas.read_table("{}/{}/train_{}.ssv".format(input_dir, mod, level), sep=" ", header=None ) | ||
23 | dev = pandas.read_table("{}/{}/dev_{}.ssv".format(input_dir, mod, level), sep=" ", header=None ) | ||
24 | test = pandas.read_table("{}/{}/test_{}.ssv".format(input_dir, mod, level), sep=" ", header=None ) | ||
25 | |||
26 | y_train = train.iloc[:,0].apply(select) | ||
27 | y_dev = dev.iloc[:,0].apply(select) | ||
28 | y_test = test.iloc[:,0].apply(select) | ||
29 | lb.fit(y_train) | ||
30 | data["LABEL"][mod]={"TRAIN":lb.transform(y_train),"DEV":lb.transform(y_dev), "TEST": lb.transform(y_test)} | ||
31 | |||
32 | data["LDA"][mod]={} | ||
33 | data["LDA"][mod]["TRAIN"]=train.iloc[:,1:].values | ||
34 | data["LDA"][mod]["DEV"]=dev.iloc[:,1:].values | ||
35 | data["LDA"][mod]["TEST"]=test.iloc[:,1:].values | ||
36 | |||
37 | data.sync() | ||
38 | data.close() | ||
39 |
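00-mmf_make_features.py is invoked as python 00-mmf_make_features.py <input_dir> <level> and writes mmf_<level>.shelve, whose key layout the downstream 04* scripts rely on. A minimal sketch of reading it back (layout taken from the script above; the path is a placeholder):

    import shelve

    data = shelve.open("output_v5/mmf_50.shelve")  # placeholder path
    for mod in ["ASR", "TRS"]:
        X_train = data["LDA"][mod]["TRAIN"]    # topic-space features
        y_train = data["LABEL"][mod]["TRAIN"]  # binarized labels
        print mod, X_train.shape, y_train.shape
    data.close()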
LDA/02-lda.py
1 | import gensim | 1 | import gensim |
2 | import os | 2 | import os |
3 | import sys | 3 | import sys |
4 | import pickle | 4 | import pickle |
5 | from gensim.models.ldamodel import LdaModel | 5 | from gensim.models.ldamodel import LdaModel |
6 | from gensim.models.ldamulticore import LdaMulticore | 6 | from gensim.models.ldamulticore import LdaMulticore |
7 | from collections import Counter | 7 | from collections import Counter |
8 | import numpy as np | 8 | import numpy as np |
9 | import codecs | 9 | import codecs |
10 | import shelve | 10 | import shelve |
11 | import logging | 11 | import logging |
12 | import dill | 12 | import dill |
13 | from tinydb import TinyDB, where, Query | 13 | from tinydb import TinyDB, where, Query |
14 | import time | 14 | import time |
15 | from joblib import Parallel, delayed | ||
15 | 16 | ||
16 | def calc_perp(models,train): | 17 | def calc_perp(models,train): |
17 | 18 | ||
18 | 19 | ||
19 | stop_words=models[1] | 20 | stop_words=models[1] |
20 | name = models[0] | 21 | name = models[0] |
21 | 22 | ||
22 | logging.warning(" go {} ".format(name)) | 23 | logging.warning(" go {} ".format(name)) |
23 | logging.warning("TRS to be done") | 24 | logging.warning("TRS to be done") |
24 | entry = Query() | 25 | entry = Query() |
25 | value=db.search(entry.name == name) | 26 | value=db.search(entry.name == name) |
26 | if len(value) > 0 : | 27 | if len(value) > 0 : |
27 | logging.warning("{} already done".format(name)) | 28 | logging.warning("{} already done".format(name)) |
28 | return | 29 | return |
29 | 30 | ||
30 | dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]] | 31 | dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]] |
31 | lda_trs = models[2] | 32 | lda_trs = models[2] |
32 | perp_trs = lda_trs.log_perplexity(dev_trs) | 33 | perp_trs = lda_trs.log_perplexity(dev_trs) |
33 | 34 | ||
34 | logging.warning("ASR to be done") | 35 | logging.warning("ASR to be done") |
35 | dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]] | 36 | dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]] |
36 | lda_asr = models[5] | 37 | lda_asr = models[5] |
37 | perp_asr = lda_asr.log_perplexity(dev_asr) | 38 | perp_asr = lda_asr.log_perplexity(dev_asr) |
38 | logging.warning("ASR saving") | 39 | logging.warning("ASR saving") |
39 | res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs } | 40 | res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs } |
40 | return res_dict | 41 | return res_dict |
41 | 42 | ||
42 | 43 | ||
43 | 44 | ||
44 | 45 | ||
45 | def train_lda(out_dir,train,size,it,sw_size,alpha,eta,passes,chunk): | 46 | def train_lda(out_dir,train,size,it,sw_size,alpha,eta,passes,chunk): |
46 | name = "s{}_it{}_sw{}_a{}_e{}_p{}_c{}".format(size,it,sw_size,alpha,eta,passes,chunk) | 47 | name = "s{}_it{}_sw{}_a{}_e{}_p{}_c{}".format(size,it,sw_size,alpha,eta,passes,chunk) |
47 | logging.warning(name) | 48 | logging.warning(name) |
48 | if os.path.isfile(out_dir+"/"+name+".dill"): | 49 | deep_out_dir = out_dir+"/"+name |
50 | if os.path.isdir(deep_out_dir): | ||
49 | logging.error(name+" already done") | 51 | logging.error(name+" already done") |
50 | return | 52 | return |
51 | logging.warning(name+" to be done") | 53 | logging.warning(name+" to be done") |
52 | asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y]) | 54 | asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y]) |
53 | trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y]) | 55 | trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y]) |
54 | asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ] | 56 | asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ] |
55 | trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ] | 57 | trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ] |
56 | stop_words=set(asr_sw) | set(trs_sw) | 58 | stop_words=set(asr_sw) | set(trs_sw) |
57 | stop_words=[ x.strip() for x in open("french.txt").readlines() ] | ||
58 | 59 | ||
59 | logging.warning("TRS to be done") | 60 | logging.warning("TRS to be done") |
60 | 61 | ||
61 | lda_trs = LdaModel(corpus=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["TRAIN"]], id2word=train["vocab"], num_topics=int(size), chunksize=chunk,iterations=it,alpha=alpha,eta=eta,passes=passes) | 62 | lda_trs = LdaModel(corpus=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["TRAIN"]], id2word=train["vocab"], num_topics=int(size), chunksize=chunk,iterations=it,alpha=alpha,eta=eta,passes=passes) |
62 | 63 | ||
63 | logging.warning("ASR to be done") | 64 | logging.warning("ASR to be done") |
64 | lda_asr = LdaModel(corpus=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["TRAIN"]], id2word=train["vocab"], num_topics=int(size), chunksize=chunk,iterations=it,alpha=alpha,eta=eta,passes=passes) | 65 | lda_asr = LdaModel(corpus=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["TRAIN"]], id2word=train["vocab"], num_topics=int(size), chunksize=chunk,iterations=it,alpha=alpha,eta=eta,passes=passes) |
65 | 66 | ||
66 | dico = train["vocab"] | 67 | dico = train["vocab"] |
67 | word_list = [ dico[x] for x in range(len(train["vocab"]))] | 68 | word_list = [ dico[x] for x in range(len(train["vocab"]))] |
68 | asr_probs = [] | 69 | asr_probs = [] |
69 | for line in lda_asr.expElogbeta: | 70 | for line in lda_asr.expElogbeta: |
70 | nline = line / np.sum(line) | 71 | nline = line / np.sum(line) |
71 | asr_probs.append( str(x) for x in nline) | 72 | asr_probs.append([ str(x) for x in nline]) |
72 | trs_probs = [] | 73 | trs_probs = [] |
73 | for line in lda_trs.expElogbeta: | 74 | for line in lda_trs.expElogbeta: |
74 | nline = line / np.sum(line) | 75 | nline = line / np.sum(line) |
75 | trs_probs.append( str(x) for x in nline) | 76 | trs_probs.append([str(x) for x in nline]) |
76 | 77 | ||
77 | K = lda_asr.num_topics | 78 | K = lda_asr.num_topics |
78 | topicWordProbMat_asr = lda_asr.print_topics(K,10) | 79 | topicWordProbMat_asr = lda_asr.print_topics(K,10) |
79 | 80 | ||
80 | K = lda_trs.num_topics | 81 | K = lda_trs.num_topics |
81 | topicWordProbMat_trs = lda_trs.print_topics(K,10) | 82 | topicWordProbMat_trs = lda_trs.print_topics(K,10) |
83 | os.mkdir(deep_out_dir) | ||
84 | dill.dump([x for x in stop_words],open(deep_out_dir+"/stopwords.dill","wb")) | ||
85 | lda_asr.save(deep_out_dir+"/lda_asr.model") | ||
86 | lda_trs.save(deep_out_dir+"/lda_trs.model") | ||
87 | dill.dump([x for x in asr_probs],open(deep_out_dir+"/lda_asr_probs.dill","wb")) | ||
88 | dill.dump([x for x in trs_probs],open(deep_out_dir+"/lda_trs_probs.dill","wb")) | ||
89 | |||
82 | return [name, stop_words, lda_asr , asr_probs , topicWordProbMat_asr, lda_trs, trs_probs, topicWordProbMat_trs] | 90 | return [name, stop_words, lda_asr , asr_probs , topicWordProbMat_asr, lda_trs, trs_probs, topicWordProbMat_trs] |
83 | 91 | ||
92 | def train_one(name,train,s,i,sw,a,e,p,c): | ||
93 | st=time.time() | ||
94 | logging.warning(" ; ".join([str(x) for x in [s,i,sw,a,e,p,c]])) | ||
95 | models = train_lda(name,train,s,i,sw,a,e,p,c) | ||
96 | if models: | ||
97 | m = calc_perp(models,train) | ||
98 | #dill.dump(models,open("{}/{}.dill".format(name,models[0]),"wb")) | ||
99 | else : | ||
100 | m = None | ||
101 | et = time.time() # do not reuse e, it still names the eta parameter | ||
102 | logging.warning("done in : {}".format(et-st)) | ||
103 | return m | ||
104 | |||
105 | |||
106 | |||
107 | |||
84 | if __name__ == "__main__": | 108 | if __name__ == "__main__": |
85 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING) | 109 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING) |
86 | 110 | ||
87 | input_shelve = sys.argv[1] | 111 | input_shelve = sys.argv[1] |
88 | db_path = sys.argv[2] | 112 | db_path = sys.argv[2] |
89 | size = [ int(x) for x in sys.argv[3].split("_")] | 113 | size = [ int(x) for x in sys.argv[3].split("_")] |
90 | workers = int(sys.argv[4]) | 114 | workers = int(sys.argv[4]) |
91 | name = sys.argv[5] | 115 | name = sys.argv[5] |
92 | it = [ int(x) for x in sys.argv[6].split("_")] | 116 | it = [ int(x) for x in sys.argv[6].split("_")] |
93 | sw_size = [ int(x) for x in sys.argv[7].split("_")] | 117 | sw_size = [ int(x) for x in sys.argv[7].split("_")] |
94 | if sys.argv[8] != "None" : | 118 | if sys.argv[8] != "None" : |
95 | alpha = [ "symmetric", "auto" ] + [ float(x) for x in sys.argv[8].split("_")] | 119 | alpha = [ "symmetric", "auto" ] + [ float(x) for x in sys.argv[8].split("_")] |
96 | eta = ["auto"] + [ float(x) for x in sys.argv[9].split("_")] | 120 | eta = ["auto"] + [ float(x) for x in sys.argv[9].split("_")] |
97 | else : | 121 | else : |
98 | alpha = ["symmetric"] | 122 | alpha = ["symmetric"] |
99 | eta = ["auto"] | 123 | eta = ["auto"] |
100 | passes = [ int(x) for x in sys.argv[10].split("_")] | 124 | passes = [ int(x) for x in sys.argv[10].split("_")] |
101 | chunk = [ int(x) for x in sys.argv[11].split("_")] | 125 | chunk = [ int(x) for x in sys.argv[11].split("_")] |
102 | 126 | ||
103 | #train=pickle.load(open("{}/newsgroup_bow_train.pk".format(input_dir))) | 127 | #train=pickle.load(open("{}/newsgroup_bow_train.pk".format(input_dir))) |
104 | train = shelve.open(input_shelve) | 128 | train = dict(shelve.open(input_shelve)) # plain dict: a shelve handle cannot be pickled for the joblib workers |
105 | try : | 129 | try : |
106 | os.mkdir(name) | 130 | os.mkdir(name) |
107 | except : | 131 | except : |
108 | logging.warning(" folder already exists " ) | 132 | logging.warning(" folder already exists " ) |
109 | db = TinyDB(db_path) | 133 | db = TinyDB(db_path) |
110 | nb_model = len(passes) * len(chunk) * len(it) * len(sw_size) * len(alpha) * len(eta) * len(size) | 134 | nb_model = len(passes) * len(chunk) * len(it) * len(sw_size) * len(alpha) * len(eta) * len(size) |
111 | logging.warning(" will train {} models ".format(nb_model)) | 135 | logging.warning(" will train {} models ".format(nb_model)) |
136 | |||
137 | args_list=[] | ||
112 | for p in passes: | 138 | for p in passes: |
113 | for c in chunk: | 139 | for c in chunk: |
114 | for i in it : | 140 | for i in it : |
115 | for sw in sw_size: | 141 | for sw in sw_size: |
116 | for a in alpha: | 142 | for a in alpha: |
117 | for e in eta: | 143 | for e in eta: |
118 | for s in size: | 144 | for s in size: |
119 | st=time.time() | 145 | args_list.append((name,train,s,i,sw,a,e,p,c)) |
120 | logging.warning(" ; ".join([str(x) for x in [s,i,sw,a,e,p,c]])) | 146 | res_list= Parallel(n_jobs=workers)(delayed(train_one)(*args) for args in args_list) |
121 | models = train_lda(name,train,s,i,sw,a,e,p,c) | 147 | for m in res_list : |
122 | if models: | 148 | if m : db.insert(m) |
123 | m = calc_perp(models,train) | 149 |
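The rewritten main now builds the full hyperparameter grid up front and hands it to joblib. A sketch of the same grid with itertools.product, equivalent to the seven nested loops above (workers comes from sys.argv[4]):

    import itertools
    from joblib import Parallel, delayed

    # one task per (passes, chunk, iterations, stopword size, alpha, eta, topics) combination
    args_list = [(name, train, s, i, sw, a, e, p, c)
                 for p, c, i, sw, a, e, s in itertools.product(passes, chunk, it, sw_size, alpha, eta, size)]
    res_list = Parallel(n_jobs=workers)(delayed(train_one)(*args) for args in args_list)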
LDA/03-mono_perplex.py
1 | import gensim | 1 | import gensim |
2 | import time | 2 | import time |
3 | import os | 3 | import os |
4 | import sys | 4 | import sys |
5 | import pickle | 5 | import pickle |
6 | from gensim.models.ldamodel import LdaModel | 6 | from gensim.models.ldamodel import LdaModel |
7 | from gensim.models.ldamulticore import LdaMulticore | 7 | from gensim.models.ldamulticore import LdaMulticore |
8 | from collections import Counter | 8 | from collections import Counter |
9 | import numpy as np | 9 | import numpy as np |
10 | import codecs | 10 | import codecs |
11 | import shelve | 11 | import shelve |
12 | import logging | 12 | import logging |
13 | import glob | 13 | import glob |
14 | from tinydb import TinyDB, where, Query | 14 | from tinydb import TinyDB, where, Query |
15 | 15 | ||
16 | 16 | ||
17 | def calc_perp(in_dir,train): | 17 | def calc_perp(in_dir,train): |
18 | name = in_dir.split("/")[-1] | 18 | name = in_dir.split("/")[-1] |
19 | # s40_it1_sw50_a0.01_e0.1_p6_c1000 | 19 | # s40_it1_sw50_a0.01_e0.1_p6_c1000 |
20 | sw_size = int(name.split("_")[2][2:]) | 20 | sw_size = int(name.split("_")[2][2:]) |
21 | 21 | ||
22 | logging.warning(" go {} ".format(name)) | 22 | logging.warning(" go {} ".format(name)) |
23 | 23 | ||
24 | 24 | ||
25 | logging.warning("Redo Vocab and stop") | 25 | logging.warning("Redo Vocab and stop") |
26 | asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y]) | 26 | asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y]) |
27 | trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y]) | 27 | trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y]) |
28 | asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ] | 28 | asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ] |
29 | trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ] | 29 | trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ] |
30 | stop_words=set(asr_sw) | set(trs_sw) | 30 | stop_words=set(asr_sw) | set(trs_sw) |
31 | 31 | ||
32 | logging.warning("TRS to be done") | 32 | logging.warning("TRS to be done") |
33 | entry = Query() | 33 | entry = Query() |
34 | value=db.search(entry.name == name) | 34 | value=db.search(entry.name == name) |
35 | if len(value) > 0 : | 35 | if len(value) > 0 : |
36 | logging.warning("{} already done".format(name)) | 36 | logging.warning("{} already done".format(name)) |
37 | return | 37 | return |
38 | 38 | ||
39 | dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]] | 39 | dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]] |
40 | lda_trs = LdaModel.load("{}/lda_trs.model".format(in_dir)) | 40 | lda_trs = LdaModel.load("{}/lda_trs.model".format(in_dir)) |
41 | perp_trs = lda_trs.log_perplexity(dev_trs) | 41 | perp_trs = lda_trs.log_perplexity(dev_trs) |
42 | logging.warning("ASR to be done") | 42 | logging.warning("ASR to be done") |
43 | dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]] | 43 | dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]] |
44 | lda_asr = LdaModel.load("{}/lda_asr.model".format(in_dir)) | 44 | lda_asr = LdaModel.load("{}/lda_asr.model".format(in_dir)) |
45 | perp_asr = lda_asr.log_perplexity(dev_asr) | 45 | perp_asr = lda_asr.log_perplexity(dev_asr) |
46 | logging.warning("ASR saving") | 46 | logging.warning("ASR saving") |
47 | res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs} | 47 | res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs} |
48 | return res_dict | 48 | return res_dict |
49 | 49 | ||
50 | if __name__ == "__main__": | 50 | if __name__ == "__main__": |
51 | input_shelve = sys.argv[1] | 51 | input_shelve = sys.argv[1] |
52 | input_dir = sys.argv[2] | 52 | input_dir = sys.argv[2] |
53 | db_path = sys.argv[3] | 53 | db_path = sys.argv[3] |
54 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING) | 54 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING) |
55 | folders = glob.glob("{}/*".format(input_dir)) | 55 | folders = glob.glob("{}/s*".format(input_dir)) |
56 | 56 | ||
57 | #train=pickle.load(open("{}/newsgroup_bow_train.pk".format(input_dir))) | 57 | #train=pickle.load(open("{}/newsgroup_bow_train.pk".format(input_dir))) |
58 | train = shelve.open(input_shelve) | 58 | train = shelve.open(input_shelve) |
59 | db = TinyDB(db_path) | 59 | db = TinyDB(db_path) |
60 | for indx, folder in enumerate(folders) : | 60 | for indx, folder in enumerate(folders) : |
61 | s = time.time() | 61 | s = time.time() |
62 | r=calc_perp(folder,train) | 62 | r=calc_perp(folder,train) |
63 | if r : | 63 | if r : |
64 | db.insert(r) | 64 | db.insert(r) |
65 | e = time.time() | 65 | e = time.time() |
66 | print "FIN : {} {} : {}".format(folder,indx,e-s) | 66 | print "FIN : {} {} : {}".format(folder,indx,e-s) |
67 | 67 |
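The per-model folder names encode the training hyperparameters (the format string in 02-lda.py's train_lda), and calc_perp recovers sw_size by slicing the third field. A sketch parsing the whole name; the alpha and eta fields may also hold "symmetric"/"auto", so they are kept as strings:

    def parse_name(name):
        # "s40_it1_sw50_a0.01_e0.1_p6_c1000" -> hyperparameter dict
        f = name.split("_")
        return {"size": int(f[0][1:]),    # s40   -> 40
                "it": int(f[1][2:]),      # it1   -> 1
                "sw": int(f[2][2:]),      # sw50  -> 50
                "alpha": f[3][1:],        # a0.01 -> "0.01"
                "eta": f[4][1:],          # e0.1  -> "0.1"
                "passes": int(f[5][1:]),  # p6    -> 6
                "chunk": int(f[6][1:])}   # c1000 -> 1000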
LDA/03-perplex.py
1 | import gensim | 1 | import gensim |
2 | import time | 2 | import time |
3 | import os | 3 | import os |
4 | import sys | 4 | import sys |
5 | import pickle | 5 | import pickle |
6 | from gensim.models.ldamodel import LdaModel | 6 | from gensim.models.ldamodel import LdaModel |
7 | from gensim.models.ldamulticore import LdaMulticore | 7 | from gensim.models.ldamulticore import LdaMulticore |
8 | from collections import Counter | 8 | from collections import Counter |
9 | import numpy as np | 9 | import numpy as np |
10 | import codecs | 10 | import codecs |
11 | import shelve | 11 | import shelve |
12 | import logging | 12 | import logging |
13 | import glob | 13 | import glob |
14 | from tinydb import TinyDB, where, Query | 14 | from tinydb import TinyDB, where, Query |
15 | from itertools import izip_longest, repeat | 15 | from itertools import izip_longest, repeat |
16 | from multiprocessing import Pool | 16 | from multiprocessing import Pool |
17 | 17 | ||
18 | def grouper(n, iterable, fillvalue=None): | 18 | def grouper(n, iterable, fillvalue=None): |
19 | "grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx" | 19 | "grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx" |
20 | args = [iter(iterable)] * n | 20 | args = [iter(iterable)] * n |
21 | return izip_longest(fillvalue=fillvalue, *args) | 21 | return izip_longest(fillvalue=fillvalue, *args) |
22 | 22 | ||
23 | 23 | ||
24 | def calc_perp(params): | 24 | def calc_perp(params): |
25 | in_dir,train = params | 25 | try: |
26 | name = in_dir.split("/")[-1] | 26 | in_dir,train = params |
27 | # s40_it1_sw50_a0.01_e0.1_p6_c1000 | 27 | name = in_dir.split("/")[-1] |
28 | # s40_it1_sw50_a0.01_e0.1_p6_c1000 | ||
28 | 29 | ||
29 | entry = Query() | 30 | entry = Query() |
30 | value=db.search(entry.name == name) | 31 | value=db.search(entry.name == name) |
31 | if len(value) > 0 : | 32 | if len(value) > 0 : |
32 | logging.warning("{} already done".format(name)) | 33 | logging.warning("{} already done".format(name)) |
33 | return | 34 | return |
34 | 35 | ||
35 | sw_size = int(name.split("_")[2][2:]) | 36 | sw_size = int(name.split("_")[2][2:]) |
36 | 37 | ||
37 | logging.warning(" go {} ".format(name)) | 38 | logging.warning(" go {} ".format(name)) |
38 | 39 | ||
39 | 40 | ||
40 | logging.warning("Redo Vocab and stop") | 41 | logging.warning("Redo Vocab and stop") |
41 | asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y]) | 42 | asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y]) |
42 | trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y]) | 43 | trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y]) |
43 | asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ] | 44 | asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ] |
44 | trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ] | 45 | trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ] |
45 | stop_words=set(asr_sw) | set(trs_sw) | 46 | stop_words=set(asr_sw) | set(trs_sw) |
46 | 47 | ||
47 | logging.warning("TRS to be done") | 48 | logging.warning("TRS to be done") |
48 | 49 | ||
49 | dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]] | 50 | dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]] |
50 | lda_trs = LdaModel.load("{}/lda_trs.model".format(in_dir)) | 51 | lda_trs = LdaModel.load("{}/lda_trs.model".format(in_dir)) |
51 | perp_trs = lda_trs.log_perplexity(dev_trs) | 52 | perp_trs = lda_trs.log_perplexity(dev_trs) |
52 | logging.warning("ASR to be done") | 53 | logging.warning("ASR to be done") |
53 | dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]] | 54 | dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]] |
54 | lda_asr = LdaModel.load("{}/lda_asr.model".format(in_dir)) | 55 | lda_asr = LdaModel.load("{}/lda_asr.model".format(in_dir)) |
55 | perp_asr = lda_asr.log_perplexity(dev_asr) | 56 | perp_asr = lda_asr.log_perplexity(dev_asr) |
56 | logging.warning("ASR saving") | 57 | logging.warning("ASR saving") |
57 | res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs} | 58 | res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs} |
58 | return res_dict | 59 | return res_dict |
60 | except : | ||
61 | return { "name" : name } | ||
59 | 62 | ||
60 | if __name__ == "__main__": | 63 | if __name__ == "__main__": |
61 | input_shelve = sys.argv[1] | 64 | input_shelve = sys.argv[1] |
62 | input_dir = sys.argv[2] | 65 | input_dir = sys.argv[2] |
63 | db_path = sys.argv[3] | 66 | db_path = sys.argv[3] |
64 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING) | 67 | logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING) |
65 | folders = glob.glob("{}/*".format(input_dir)) | 68 | folders = glob.glob("{}/*".format(input_dir)) |
66 | 69 | ||
67 | #train=pickle.load(open("{}/newsgroup_bow_train.pk".format(input_dir))) | 70 | #train=pickle.load(open("{}/newsgroup_bow_train.pk".format(input_dir))) |
68 | train = dict(shelve.open(input_shelve)) | 71 | train = dict(shelve.open(input_shelve)) |
69 | db = TinyDB(db_path) | 72 | db = TinyDB(db_path) |
70 | names = [ x["name"] for x in db.all()] | 73 | names = [ x["name"] for x in db.all()] |
71 | p = Pool(processes=14,maxtasksperchild=10) | 74 | p = Pool(processes=14,maxtasksperchild=10) |
72 | 75 | ||
73 | s = time.time() | 76 | s = time.time() |
74 | perplexs = p.map(calc_perp,zip(folders,repeat(train,len(folders)))) | 77 | perplexs = p.map(calc_perp,zip(folders,repeat(train,len(folders)))) |
75 | 78 | ||
76 | for indx, perp in enumerate(perplexs) : | 79 | for indx, perp in enumerate(perplexs) : |
77 | if perp : | 80 | if perp : |
78 | db.insert(perp) | 81 | db.insert(perp) |
79 | e = time.time() | 82 | e = time.time() |
80 | print "FIN : {} : {}".format(indx,e-s) | 83 | print "FIN : {} : {}".format(indx,e-s) |
81 | 84 |
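Note the one behavioural change in main: the shelve is materialized into a plain dict before Pool.map, because a shelve.Shelf wraps an open dbm handle and cannot be pickled to worker processes. A minimal sketch of the distinction:

    import shelve, pickle

    db = shelve.open("DECODA_list_wid.shelve")
    # pickle.dumps(db)      # fails: the underlying dbm object is not picklable
    snapshot = dict(db)     # plain dict, safe to ship to multiprocessing workers
    pickle.dumps(snapshot)  # fine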
LDA/04a-mmdf.py
File was created | 1 | ||
2 | # coding: utf-8 | ||
3 | |||
4 | # In[29]: | ||
5 | |||
6 | # Import | ||
7 | import itertools | ||
8 | import shelve | ||
9 | import pickle | ||
10 | import numpy | ||
11 | import scipy | ||
12 | from scipy import sparse | ||
13 | import scipy.sparse | ||
14 | import scipy.io | ||
15 | from mlp import * | ||
16 | import mlp | ||
17 | import sys | ||
18 | import utils | ||
19 | import dill | ||
20 | from collections import Counter | ||
21 | from gensim.models import LdaModel | ||
22 | from keras.optimizers import Adam # Adam is used below for mlp_sgd | ||
23 | |||
24 | |||
25 | # In[3]: | ||
26 | |||
27 | #30_50_50_150_0.0001 | ||
28 | |||
29 | # In[4]: | ||
30 | |||
31 | #db=shelve.open("SPELIKE_MLP_DB.shelve",writeback=True) | ||
32 | origin_corps=shelve.open("{}".format(sys.argv[2])) | ||
33 | in_dir = sys.argv[1] | ||
34 | |||
35 | |||
36 | out_db=shelve.open("{}/mlp_scores.shelve".format(in_dir),writeback=True) | ||
37 | |||
38 | mlp_h = [ 250, 250 ] | ||
39 | mlp_loss = "categorical_crossentropy" | ||
40 | mlp_dropouts = [0.25]* len(mlp_h) | ||
41 | mlp_sgd = Adam(lr=0.0001) | ||
42 | mlp_epochs = 3000 | ||
43 | mlp_batch_size = 1 | ||
44 | mlp_input_activation = "relu" | ||
45 | mlp_output_activation="softmax" | ||
46 | |||
47 | ress = [] | ||
48 | for key in ["TRS", "ASR"] : | ||
49 | |||
50 | res=mlp.train_mlp(origin_corps["LDA"][key]["TRAIN"],origin_corps["LABEL"][key]["TRAIN"], | ||
51 | origin_corps["LDA"][key]["DEV"],origin_corps["LABEL"][key]["DEV"], | ||
52 | origin_corps["LDA"][key]["TEST"],origin_corps["LABEL"][key]["TEST"], | ||
53 | mlp_h,dropouts=mlp_dropouts,sgd=mlp_sgd, | ||
54 | epochs=mlp_epochs, | ||
55 | batch_size=mlp_batch_size, | ||
56 | save_pred=False,keep_histo=False, | ||
57 | loss="categorical_crossentropy",fit_verbose=0) | ||
58 | arg_best=[] | ||
59 | dev_best=[] | ||
60 | # keep the 12 best dev epochs: repeatedly take the argmax, record it, zero it out | ||
61 | for _ in range(12): | ||
62 |     arg_best.append(numpy.argmax(res[1])) | ||
63 |     dev_best.append(res[1][arg_best[-1]]) | ||
64 |     res[1][arg_best[-1]]=0 | ||
96 | |||
97 | |||
98 | |||
99 | |||
100 | test_best =[ res[2][x] for x in arg_best ] | ||
101 | test_max = numpy.max(res[2]) | ||
102 | out_db[key]=(res,(dev_best,test_best,test_max)) | ||
103 | ress.append((key,dev_best,test_best,test_max)) | ||
104 | |||
105 | for el in ress : | ||
106 | print el | ||
107 | out_db.close() | ||
108 | origin_corps.close() | ||
109 |
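The argmax-and-zero loop above selects the 12 best dev epochs destructively; an equivalent non-destructive form with numpy.argsort (identical up to tie-breaking), as a sketch:

    arg_best = numpy.argsort(res[1])[::-1][:12]    # indices of the 12 highest dev scores
    dev_best = [res[1][i] for i in arg_best]
    test_best = [res[2][i] for i in arg_best]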
LDA/04b-mmf_mini_ae.py
File was created | 1 | ||
2 | # coding: utf-8 | ||
3 | |||
4 | # In[2]: | ||
5 | |||
6 | # Import | ||
7 | import gensim | ||
8 | from scipy import sparse | ||
9 | import itertools | ||
10 | from sklearn import preprocessing | ||
11 | from keras.models import Sequential | ||
12 | from keras.optimizers import SGD,Adam | ||
13 | from mlp import * | ||
14 | import sklearn.metrics | ||
15 | import shelve | ||
16 | import pickle | ||
17 | from utils import * | ||
18 | import sys | ||
19 | import os | ||
20 | import json | ||
21 | # In[4]: | ||
22 | |||
23 | infer_model=shelve.open("{}".format(sys.argv[2])) | ||
24 | in_dir = sys.argv[1] | ||
25 | #['ASR', 'TRS', 'LABEL'] | ||
26 | # In[6]: | ||
27 | |||
28 | |||
29 | hidden_size=[ 100 , 50, 100 ] | ||
30 | input_activation="tanh" | ||
31 | output_activation="tanh" | ||
32 | loss="mse" | ||
33 | epochs=1000 | ||
34 | batch=1 | ||
35 | patience=60 | ||
36 | do_do=[False] | ||
37 | sgd = Adam(lr=0.000001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) | ||
38 | |||
39 | |||
40 | |||
41 | mlp_h = [ 150 ,150 ,150 ] | ||
42 | mlp_loss = "categorical_crossentropy" | ||
43 | mlp_dropouts = [] | ||
44 | mlp_sgd = Adam(lr=0.0001) | ||
45 | mlp_epochs = 2000 | ||
46 | mlp_batch_size = 8 | ||
47 | mlp_output_activation="softmax" | ||
48 | |||
49 | try : | ||
50 | sgd_repr=sgd.get_config()["name"] | ||
51 | except AttributeError : | ||
52 | sgd_repr=sgd | ||
53 | |||
54 | try : | ||
55 | mlp_sgd_repr=mlp_sgd.get_config()["name"] | ||
56 | except AttributeError : | ||
57 | mlp_sgd_repr=mlp_sgd | ||
58 | |||
59 | |||
60 | params={ "h1" : "_".join([ str(x) for x in hidden_size ]), | ||
61 | "inside_activation" : input_activation, | ||
62 | "output_activation" : output_activation, | ||
63 | "do_dropout": "_".join([str(x) for x in do_do]), | ||
64 | "loss" : loss, | ||
65 | "epochs" : epochs , | ||
66 | "batch_size" : batch, | ||
67 | "patience" : patience, | ||
68 | "sgd" : sgd_repr, | ||
69 | "mlp_h ": "_".join([str(x) for x in mlp_h]), | ||
70 | "mlp_loss ": mlp_loss, | ||
71 | "mlp_dropouts ": "_".join([str(x) for x in mlp_dropouts]), | ||
72 | "mlp_sgd ": mlp_sgd_repr, | ||
73 | "mlp_epochs ": mlp_epochs, | ||
74 | "mlp_batch_size ": mlp_batch_size, | ||
75 | "mlp_output" : mlp_output_activation | ||
76 | } | ||
77 | name = "_".join([ str(x) for x in params.values()]) | ||
78 | try: | ||
79 | os.mkdir("{}/{}".format(in_dir,name)) | ||
80 | except: | ||
81 | pass | ||
82 | db = shelve.open("{}/{}/ae_model.shelve".format(in_dir,name),writeback=True) | ||
83 | db["params"] = params | ||
84 | db["LABEL"]=infer_model["LABEL"] | ||
85 | # | ||
86 | json.dump(params, | ||
87 | open("{}/{}/ae_model.json".format(in_dir,name),"w"), | ||
88 | indent=4) | ||
89 | |||
90 | keys = ["ASR","TRS"] | ||
91 | |||
92 | db["AE"] = {} | ||
93 | db["LDA"] = {} | ||
94 | for mod in keys : | ||
95 | print mod | ||
96 | db["LDA"][mod] = train_mlp(infer_model["LDA"][mod]["TRAIN"],infer_model["LABEL"][mod]["TRAIN"], | ||
97 | infer_model["LDA"][mod]["DEV"],infer_model["LABEL"][mod]["DEV"], | ||
98 | infer_model["LDA"][mod]["TEST"],infer_model["LABEL"][mod]["TEST"], | ||
99 | mlp_h ,sgd=mlp_sgd, | ||
100 | epochs=mlp_epochs, | ||
101 | batch_size=mlp_batch_size, | ||
102 | input_activation=input_activation, | ||
103 | output_activation=mlp_output_activation, | ||
104 | dropouts=mlp_dropouts, | ||
105 | fit_verbose=0) | ||
106 | |||
107 | res=train_ae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"],infer_model["LDA"][mod]["TEST"], | ||
108 | hidden_size,patience = params["patience"],sgd=sgd, | ||
109 | dropouts=do_do,input_activation=input_activation,output_activation=output_activation, | ||
110 | loss=loss,epochs=epochs,batch_size=batch,verbose=0) | ||
111 | mlp_res_list=[] | ||
112 | for layer in res : | ||
113 | mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], | ||
114 | layer[1],infer_model["LABEL"][mod]["DEV"], | ||
115 | layer[2],infer_model["LABEL"][mod]["TEST"], | ||
116 | mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs, | ||
117 | output_activation=mlp_output_activation, | ||
118 | input_activation=input_activation, | ||
119 | batch_size=mlp_batch_size,fit_verbose=0)) | ||
120 | db["AE"][mod]=mlp_res_list | ||
121 | |||
122 | mod = "ASR" | ||
123 | mod2= "TRS" | ||
124 | mlp_res_list=[] | ||
125 | |||
126 | res = train_ae(infer_model["LDA"][mod]["TRAIN"], | ||
127 | infer_model["LDA"][mod]["DEV"], | ||
128 | infer_model["LDA"][mod]["TEST"], | ||
129 | hidden_size,dropouts=do_do,patience = params["patience"], | ||
130 | sgd=sgd,input_activation=input_activation,output_activation=output_activation,loss=loss,epochs=epochs, | ||
131 | batch_size=batch, | ||
132 | y_train=infer_model["LDA"][mod]["TRAIN"], | ||
133 | y_dev=infer_model["LDA"][mod2]["DEV"], | ||
134 | y_test=infer_model["LDA"][mod2]["TEST"]) | ||
135 | |||
136 | for layer in res : | ||
137 | mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], | ||
138 | layer[1],infer_model["LABEL"][mod]["DEV"], | ||
139 | layer[2],infer_model["LABEL"][mod]["TEST"], | ||
140 | mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs, | ||
141 | output_activation=mlp_output_activation, | ||
142 | input_activation=input_activation, | ||
143 | batch_size=mlp_batch_size,fit_verbose=0)) | ||
144 | |||
145 | db["AE"]["SPE"] = mlp_res_list | ||
146 | |||
147 | db.sync() | ||
148 | db.close() | ||
149 |
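train_ae and train_mlp come from the project's mlp.py, which is not part of this commit, so their exact implementation is unknown. Purely as an assumption, a bottlenecked autoencoder matching hidden_size=[100, 50, 100] might look like this in Keras; train_ae presumably returns, per hidden layer, the (train, dev, test) encodings that the loop above feeds to train_mlp:

    from keras.models import Sequential
    from keras.layers import Dense

    def build_ae(input_dim, hidden=[100, 50, 100], activation="tanh"):
        # hypothetical sketch, not the actual mlp.train_ae from this repo
        model = Sequential()
        model.add(Dense(hidden[0], activation=activation, input_dim=input_dim))
        for h in hidden[1:]:
            model.add(Dense(h, activation=activation))
        model.add(Dense(input_dim, activation=activation))  # reconstruct the input
        model.compile(optimizer="adam", loss="mse")
        return model

    # trained to reproduce its own input: build_ae(d).fit(x_train, x_train)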
LDA/04c-mmf_sae.py
File was created | 1 | ||
2 | # coding: utf-8 | ||
3 | |||
4 | # In[2]: | ||
5 | |||
6 | # Import | ||
7 | import gensim | ||
8 | from scipy import sparse | ||
9 | import itertools | ||
10 | from sklearn import preprocessing | ||
11 | from keras.models import Sequential | ||
12 | from keras.optimizers import SGD,Adam | ||
13 | from mlp import * | ||
14 | import mlp | ||
15 | import sklearn.metrics | ||
16 | import shelve | ||
17 | import pickle | ||
18 | from utils import * | ||
19 | import sys | ||
20 | import os | ||
21 | import json | ||
22 | # In[4]: | ||
23 | |||
24 | infer_model=shelve.open("{}".format(sys.argv[2])) | ||
25 | in_dir = sys.argv[1] | ||
26 | #['ASR', 'TRS', 'LABEL'] | ||
27 | # In[6]: | ||
28 | |||
29 | |||
30 | hidden_size=[ 100, 80, 50 , 20 ] | ||
31 | input_activation="relu" | ||
32 | output_activation="relu" | ||
33 | loss="mse" | ||
34 | epochs=3000 | ||
35 | batch=1 | ||
36 | patience=20 | ||
37 | do_do=[ 0 ] * len(hidden_size) | ||
38 | sgd = Adam(lr=0.0001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) | ||
39 | try : | ||
40 | sgd_repr=sgd.get_config()["name"] | ||
41 | except AttributeError : | ||
42 | sgd_repr=sgd | ||
43 | |||
44 | params={ "h1" : "_".join([str(x) for x in hidden_size]), | ||
45 | "inside_activation" : input_activation, | ||
46 | "out_activation" : output_activation, | ||
47 | "do_dropout": "_".join([str(x) for x in do_do]), | ||
48 | "loss" : loss, | ||
49 | "epochs" : epochs , | ||
50 | "batch_size" : batch, | ||
51 | "patience" : patience, | ||
52 | "sgd" : sgd_repr} | ||
53 | name = "_".join([ str(x) for x in params.values()]) | ||
54 | try: | ||
55 | os.mkdir("{}/SAE_{}".format(in_dir,name)) | ||
56 | except: | ||
57 | pass | ||
58 | db = shelve.open("{}/SAE_{}/ae_model.shelve".format(in_dir,name),writeback=True) | ||
59 | # | ||
60 | json.dump(params, | ||
61 | open("{}/SAE_{}/ae_model.json".format(in_dir,name),"w"), | ||
62 | indent=4) | ||
63 | |||
64 | keys = ["ASR","TRS"] | ||
65 | |||
66 | mlp_h = [ 150 , 300 ] | ||
67 | mlp_loss ="categorical_crossentropy" | ||
68 | mlp_dropouts = [0,0,0,0] | ||
69 | mlp_sgd = Adam(0.001) | ||
70 | mlp_epochs = 2000 | ||
71 | mlp_batch_size = 8 | ||
72 | |||
73 | db["SAE"] = {} | ||
74 | |||
75 | db["SAEFT"] = {} | ||
76 | for mod in keys : | ||
77 | print "MODE ", mod | ||
78 | res_tuple=train_sae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"], | ||
79 | infer_model["LDA"][mod]["TEST"], | ||
80 | hidden_size,dropouts=do_do, | ||
81 | patience = params["patience"],sgd=sgd,input_activation="tanh", | ||
82 | output_activation="tanh",loss=loss,epochs=epochs, | ||
83 | batch_size=batch,verbose=0) | ||
84 | #print len(res), [len(x) for x in res[0]], [ len(x) for x in res[1]] | ||
85 | for name , levels in zip(["SAE","SAEFT"],res_tuple): | ||
86 | print "NAME", name | ||
87 | mlp_res_by_level = [] | ||
88 | for res in levels: | ||
89 | mlp_res_list=[] | ||
90 | for nb,layer in enumerate(res) : | ||
91 | print "layer NB",nb | ||
92 | mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], | ||
93 | layer[1],infer_model["LABEL"][mod]["DEV"], | ||
94 | layer[2],infer_model["LABEL"][mod]["TEST"], | ||
95 | mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, | ||
96 | sgd=mlp_sgd,epochs=mlp_epochs,batch_size=mlp_batch_size, | ||
97 | fit_verbose=0)) | ||
98 | mlp_res_by_level.append(mlp_res_list) | ||
99 | db[name][mod]=mlp_res_by_level | ||
100 | |||
101 | mod = "ASR" | ||
102 | mod2= "TRS" | ||
103 | print "mode SPE " | ||
104 | res_tuple = train_sae(infer_model["LDA"][mod]["TRAIN"], | ||
105 | infer_model["LDA"][mod]["DEV"], | ||
106 | infer_model["LDA"][mod]["TEST"], | ||
107 | hidden_size,dropouts=[0],patience=params["patience"], | ||
108 | sgd=sgd,input_activation=input_activation,output_activation=input_activation, | ||
109 | loss=loss,epochs=epochs,batch_size=batch, | ||
110 | y_train=infer_model["LDA"][mod2]["TRAIN"], | ||
111 | y_dev=infer_model["LDA"][mod2]["DEV"], | ||
112 | y_test=infer_model["LDA"][mod2]["TEST"]) | ||
113 | |||
114 | for name , levels in zip(["SAE","SAEFT"],res_tuple): | ||
115 | mlp_res_by_level = [] | ||
116 | for res in levels : | ||
117 | mlp_res_list=[] | ||
118 | for layer in res : | ||
119 | mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], | ||
120 | layer[1],infer_model["LABEL"][mod]["DEV"],layer[2], | ||
121 | infer_model["LABEL"][mod]["TEST"], | ||
122 | mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, | ||
123 | sgd=mlp_sgd,epochs=mlp_epochs,batch_size=mlp_batch_size, | ||
124 | fit_verbose=0)) | ||
125 | mlp_res_by_level.append(mlp_res_list) | ||
126 | db[name]["SPE"] = mlp_res_by_level | ||
127 | |||
128 | db.close() | ||
129 |
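train_sae (also from mlp.py) returns a pair of result lists, matching the SAE (pretrained only) / SAEFT (fine-tuned) split above. As an assumption about what it does, greedy layer-wise pretraining trains one small autoencoder per hidden layer, each on the codes produced by the previous one:

    from keras.models import Sequential
    from keras.layers import Dense

    def greedy_pretrain(x, hidden=[100, 80, 50, 20], activation="tanh"):
        # hypothetical sketch of layer-wise SAE pretraining, not the repo's train_sae
        codes, encoders = x, []
        for h in hidden:
            ae = Sequential()
            ae.add(Dense(h, activation=activation, input_dim=codes.shape[1]))
            ae.add(Dense(codes.shape[1], activation=activation))
            ae.compile(optimizer="adam", loss="mse")
            ae.fit(codes, codes, verbose=0)
            enc = Sequential()  # keep only the encoder half, with the trained weights
            enc.add(Dense(h, activation=activation, input_dim=codes.shape[1],
                          weights=ae.layers[0].get_weights()))
            codes = enc.predict(codes)  # these codes train the next layer's AE
            encoders.append(enc)
        return encoders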
LDA/04d-mmf_dsae.py
File was created | 1 | ||
2 | # coding: utf-8 | ||
3 | |||
4 | # In[2]: | ||
5 | |||
6 | # Import | ||
7 | import gensim | ||
8 | from scipy import sparse | ||
9 | import itertools | ||
10 | from sklearn import preprocessing | ||
11 | from keras.models import Sequential | ||
12 | from keras.optimizers import SGD,Adam | ||
13 | from mlp import * | ||
14 | import mlp | ||
15 | import sklearn.metrics | ||
16 | import shelve | ||
17 | import pickle | ||
18 | from utils import * | ||
19 | import sys | ||
20 | import os | ||
21 | import json | ||
22 | # In[4]: | ||
23 | |||
24 | infer_model=shelve.open("{}".format(sys.argv[2])) | ||
25 | in_dir = sys.argv[1] | ||
26 | #['ASR', 'TRS', 'LABEL'] | ||
27 | # In[6]: | ||
28 | |||
29 | # AE params | ||
30 | hidden_size=[ 100, 100 ] | ||
31 | input_activation="relu" | ||
32 | output_activation="relu" | ||
33 | loss="mse" | ||
34 | epochs= 1000 | ||
35 | batch_size=1 | ||
36 | patience=20 | ||
37 | do_do=[ 0.25 ] * len(hidden_size) | ||
38 | sgd = Adam(lr=0.00001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) | ||
39 | try : | ||
40 | sgd_repr=sgd.get_config()["name"] | ||
41 | except AttributeError : | ||
42 | sgd_repr=sgd | ||
43 | |||
44 | # Transforme : | ||
45 | trans_hidden_size=[ 300 , 300 ] | ||
46 | trans_input_activation="relu" | ||
47 | trans_output_activation="relu" | ||
48 | trans_loss="mse" | ||
49 | trans_epochs=1000 | ||
50 | trans_batch_size=8 | ||
51 | trans_patience=20 | ||
52 | trans_do=[ 0.25 ] * len(trans_hidden_size) | ||
53 | trans_sgd = Adam(lr=0.0001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) | ||
54 | try : | ||
55 | trans_sgd_repr=trans_sgd.get_config()["name"] | ||
56 | except AttributeError : | ||
57 | trans_sgd_repr=trans_sgd | ||
58 | |||
59 | |||
60 | |||
61 | ae={ "h1" : "_".join([str(x) for x in hidden_size]), | ||
62 | "inside_activation" : input_activation, | ||
63 | "out_activation" : output_activation, | ||
64 | "do_dropout": "_".join([str(x) for x in do_do]), | ||
65 | "loss" : loss, | ||
66 | "epochs" : epochs , | ||
67 | "batch_size" : batch_size, | ||
68 | "patience" : patience, | ||
69 | "sgd" : sgd_repr} | ||
70 | name = "_".join([ str(x) for x in ae.values()]) | ||
71 | |||
72 | trans={ "h1" : "_".join([str(x) for x in trans_hidden_size]), | ||
73 | "inside_activation" : trans_input_activation, | ||
74 | "out_activation" : trans_output_activation, | ||
75 | "do_dropout": "_".join([str(x) for x in trans_do]), | ||
76 | "loss" : trans_loss, | ||
77 | "epochs" : trans_epochs , | ||
78 | "batch_size" : trans_batch_size, | ||
79 | "patience" : trans_patience, | ||
80 | "sgd" : trans_sgd_repr} | ||
81 | |||
82 | mlp_h = [ 300 , 300 ] | ||
83 | mlp_loss ="categorical_crossentropy" | ||
84 | mlp_dropouts = [0,0,0,0] | ||
85 | mlp_sgd = Adam(0.0001) | ||
86 | mlp_epochs = 1000 | ||
87 | mlp_batch_size = 8 | ||
88 | mlp_input_activation = "relu" | ||
89 | mlp_output_activation = "softmax" | ||
90 | |||
91 | try : | ||
92 | mlp_sgd_repr=mlp_sgd.get_config()["name"] | ||
93 | except AttributeError : | ||
94 | mlp_sgd_repr=mlp_sgd | ||
95 | |||
96 | |||
97 | |||
98 | mlp={ "h1" : "_".join([str(x) for x in mlp_h ]), | ||
99 | "inside_activation" : mlp_input_activation, | ||
100 | "out_activation" : mlp_output_activation, | ||
101 | "do_dropout": "_".join([str(x) for x in mlp_dropouts]), | ||
102 | "loss" : mlp_loss, | ||
103 | "epochs" : mlp_epochs , | ||
104 | "batch_size" : mlp_batch_size, | ||
105 | "sgd" : mlp_sgd_repr} | ||
106 | |||
107 | params = { "ae":ae, "trans":trans, "mlp":mlp} | ||
108 | try: | ||
109 | os.mkdir("{}/DSAE_{}".format(in_dir,name)) | ||
110 | except: | ||
111 | pass | ||
112 | db = shelve.open("{}/DSAE_{}/ae_model.shelve".format(in_dir,name),writeback=True) | ||
113 | # | ||
114 | json.dump(params, | ||
115 | open("{}/DSAE_{}/ae_model.json".format(in_dir,name),"w"), | ||
116 | indent=4) | ||
117 | |||
118 | keys = ["ASR","TRS"] | ||
119 | |||
120 | |||
121 | |||
122 | db["DSAE"] = {} | ||
123 | |||
124 | db["DSAEFT"] = {} | ||
125 | mod = "ASR" | ||
126 | res_tuple_ASR = train_ae(infer_model["LDA"][mod]["TRAIN"], | ||
127 | infer_model["LDA"][mod]["DEV"], | ||
128 | infer_model["LDA"][mod]["TEST"], | ||
129 | hidden_size,dropouts=do_do, | ||
130 | patience = patience,sgd=sgd, | ||
131 | input_activation=input_activation, | ||
132 | output_activation=output_activation,loss=loss,epochs=epochs, | ||
133 | batch_size=batch_size,verbose=0,get_weights=True) | ||
134 | mlp_res_list = [] | ||
135 | for layer in res_tuple_ASR[0]: | ||
136 | mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], | ||
137 | layer[1],infer_model["LABEL"][mod]["DEV"], | ||
138 | layer[2],infer_model["LABEL"][mod]["TEST"], | ||
139 | mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, | ||
140 | sgd=mlp_sgd,epochs=mlp_epochs, | ||
141 | output_activation=mlp_output_activation, | ||
142 | input_activation=mlp_input_activation, | ||
143 | batch_size=mlp_batch_size,fit_verbose=0)) | ||
144 | |||
145 | db["DSAE"][mod] = mlp_res_list | ||
146 | mod = "TRS" | ||
147 | print hidden_size | ||
148 | res_tuple_TRS = train_ae(infer_model["LDA"][mod]["TRAIN"], | ||
149 | infer_model["LDA"][mod]["DEV"], | ||
150 | infer_model["LDA"][mod]["TEST"], | ||
151 | hidden_size,dropouts=do_do, | ||
152 | sgd=sgd,input_activation=input_activation, | ||
153 | output_activation=output_activation,loss=loss,epochs=epochs, | ||
154 | batch_size=batch_size,patience=patience, | ||
155 | verbose=0,get_weights=True) | ||
156 | |||
157 | mlp_res_list = [] | ||
158 | for layer in res_tuple_TRS[0]: | ||
159 | mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], | ||
160 | layer[1],infer_model["LABEL"][mod]["DEV"], | ||
161 | layer[2],infer_model["LABEL"][mod]["TEST"], | ||
162 | mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, | ||
163 | sgd=mlp_sgd,epochs=mlp_epochs, | ||
164 | output_activation=mlp_output_activation, | ||
165 | input_activation=mlp_input_activation, | ||
166 | batch_size=mlp_batch_size,fit_verbose=0)) | ||
167 | |||
168 | db["DSAE"][mod] = mlp_res_list | ||
169 | |||
170 | |||
171 | |||
172 | transfert = [] | ||
173 | |||
174 | print " get weight trans" | ||
175 | |||
176 | for asr_pred, trs_pred in zip(res_tuple_ASR[0], res_tuple_TRS[0]): | ||
177 | print "ASR", [ x.shape for x in asr_pred] | ||
178 | |||
179 | print "TRS", [ x.shape for x in trs_pred] | ||
180 | |||
181 | |||
182 | for asr_pred, trs_pred in zip(res_tuple_ASR[0], res_tuple_TRS[0]): | ||
183 | print "ASR", [ x.shape for x in asr_pred] | ||
184 | |||
185 | print "TRS", [ x.shape for x in trs_pred] | ||
186 | transfert.append( train_ae(asr_pred[0], | ||
187 | asr_pred[1], | ||
188 | asr_pred[2], | ||
189 | trans_hidden_size, | ||
190 | dropouts=trans_do, | ||
191 | y_train = trs_pred[0], | ||
192 | y_dev=trs_pred[1], | ||
193 | y_test = trs_pred[2], | ||
194 | patience = trans_patience,sgd=trans_sgd, | ||
195 | input_activation=trans_input_activation, | ||
196 | output_activation=trans_output_activation, | ||
197 | loss=trans_loss, | ||
198 | epochs=trans_epochs, | ||
199 | batch_size=trans_batch_size,verbose=0,get_weights=True) ) | ||
200 | mod = "ASR" | ||
201 | mlp_res_bylvl = [] | ||
202 | print " MLP on transfert " | ||
203 | for level, w in transfert : | ||
204 | mlp_res_list = [] | ||
205 | for layer in level : | ||
206 | mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], | ||
207 | layer[1],infer_model["LABEL"][mod]["DEV"], | ||
208 | layer[2],infer_model["LABEL"][mod]["TEST"], | ||
209 | mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, | ||
210 | sgd=mlp_sgd,epochs=mlp_epochs, | ||
211 | output_activation=mlp_output_activation, | ||
212 | input_activation=mlp_input_activation, | ||
213 | batch_size=mlp_batch_size,fit_verbose=0)) | ||
214 | mlp_res_bylvl.append(mlp_res_list) | ||
215 | db["DSAE"]["transfert"] = mlp_res_bylvl | ||
216 | |||
217 | |||
218 | print " FT " | ||
219 | WA = res_tuple_ASR[1] | ||
220 | print "WA", len(WA), [ len(x) for x in WA] | ||
221 | WT = res_tuple_TRS[1] | ||
222 | |||
223 | print "WT", len(WT), [ len(x) for x in WT] | ||
224 | Wtr = [ x[1] for x in transfert] | ||
225 | |||
226 | print "Wtr", len(Wtr), [ len(x) for x in Wtr],[ len(x[1]) for x in Wtr] | ||
227 | |||
228 | ft_res = ft_dsae(infer_model["LDA"]["ASR"]["TRAIN"], | ||
229 | infer_model["LDA"]["ASR"]["DEV"], | ||
230 | infer_model["LDA"]["ASR"]["TEST"], | ||
231 | y_train=infer_model["LDA"]["TRS"]["TRAIN"], | ||
232 | y_dev=infer_model["LDA"]["TRS"]["DEV"], | ||
233 | y_test=infer_model["LDA"]["TRS"]["TEST"], | ||
234 | ae_hidden = hidden_size, | ||
235 | transfer_hidden = trans_hidden_size, | ||
236 | start_weights = WA, | ||
237 | transfer_weights = Wtr, | ||
238 | end_weights = WT, | ||
239 | input_activation = input_activation, | ||
240 | output_activation = output_activation, | ||
241 | ae_dropouts= do_do, | ||
242 | transfer_do = trans_do, | ||
243 | sgd = sgd, | ||
244 | loss = loss , | ||
245 | patience = patience, | ||
246 | batch_size = batch_size, | ||
247 | epochs= epochs) | ||
248 | mlps_by_lvls= [] | ||
249 | for level in ft_res : | ||
250 | mlp_res_list = [] | ||
251 | for layer in level : | ||
252 | mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], | ||
253 | layer[1],infer_model["LABEL"][mod]["DEV"], | ||
254 | layer[2],infer_model["LABEL"][mod]["TEST"], | ||
255 | mlp_h,loss=mlp_loss,dropouts=mlp_dropouts, | ||
256 | sgd=mlp_sgd,epochs=mlp_epochs, | ||
257 | output_activation=mlp_output_activation, | ||
258 | input_activation=mlp_input_activation, | ||
259 | batch_size=mlp_batch_size,fit_verbose=0)) | ||
260 | mlps_by_lvls.append(mlp_res_list) | ||
261 | |||
262 | |||
263 | db["DSAEFT"]["transfert"] = mlps_by_lvls | ||
264 | |||
265 | db.close() | ||
266 |
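The transfert stage above reuses train_ae in supervised mode: for each depth it fits a regressor from the ASR representation to the matching TRS representation, and ft_dsae then chains ASR encoder, transfer network, and TRS decoder for end-to-end fine-tuning. A hypothetical sketch of such a transfer regressor (trans_hidden_size=[300, 300] as configured above):

    from keras.models import Sequential
    from keras.layers import Dense

    def build_transfer(in_dim, out_dim, hidden=[300, 300], activation="relu"):
        # hypothetical sketch of the ASR->TRS mapping, not the repo's train_ae call
        model = Sequential()
        model.add(Dense(hidden[0], activation=activation, input_dim=in_dim))
        for h in hidden[1:]:
            model.add(Dense(h, activation=activation))
        model.add(Dense(out_dim, activation=activation))
        model.compile(optimizer="adam", loss="mse")
        return model

    # fitted with ASR codes as inputs and TRS codes as targets:
    # build_transfer(h_asr.shape[1], h_trs.shape[1]).fit(h_asr, h_trs)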
LDA/04e-mm_vae.py
File was created | 1 | ||
2 | # coding: utf-8 | ||
3 | |||
4 | # In[2]: | ||
5 | |||
6 | # Import | ||
7 | import gensim | ||
8 | from scipy import sparse | ||
9 | import itertools | ||
10 | from sklearn import preprocessing | ||
11 | from keras.models import Sequential | ||
12 | from keras.optimizers import SGD,Adam | ||
13 | from mlp import * | ||
14 | from vae import * | ||
15 | import sklearn.metrics | ||
16 | import shelve | ||
17 | import pickle | ||
18 | from utils import * | ||
19 | import sys | ||
20 | import os | ||
21 | import json | ||
22 | # In[4]: | ||
23 | |||
24 | infer_model=shelve.open("{}".format(sys.argv[2])) | ||
25 | in_dir = sys.argv[1] | ||
26 | #['ASR', 'TRS', 'LABEL'] | ||
27 | # In[6]: | ||
28 | |||
29 | |||
30 | hidden_size= [60] | ||
31 | input_activation="tanh" | ||
32 | output_activation="sigmoid" | ||
33 | epochs=300 | ||
34 | batch=1 | ||
35 | patience=60 | ||
36 | sgd = Adam(lr=0.0001)#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True) | ||
37 | latent_dim = 30 | ||
38 | |||
39 | |||
40 | |||
41 | mlp_h = [ 256 ] | ||
42 | mlp_loss = "categorical_crossentropy" | ||
43 | mlp_dropouts = [] | ||
44 | mlp_sgd = Adam(lr=0.001) | ||
45 | mlp_epochs = 1000 | ||
46 | mlp_batch_size = 16 | ||
47 | mlp_output_activation="softmax" | ||
48 | |||
49 | try : | ||
50 | sgd_repr=sgd.get_config()["name"] | ||
51 | except AttributeError : | ||
52 | sgd_repr=sgd | ||
53 | |||
54 | try : | ||
55 | mlp_sgd_repr=mlp_sgd.get_config()["name"] | ||
56 | except AttributeError : | ||
57 | mlp_sgd_repr=mlp_sgd | ||
58 | |||
59 | |||
60 | params={ "h1" : "_".join([ str(x) for x in hidden_size ]), | ||
61 | "inside_activation" : input_activation, | ||
62 | "output_activation" : output_activation, | ||
63 | "epochs" : epochs , | ||
64 | "batch_size" : batch, | ||
65 | "patience" : patience, | ||
66 | "sgd" : sgd_repr, | ||
67 | "mlp_h ": "_".join([str(x) for x in mlp_h]), | ||
68 | "mlp_loss ": mlp_loss, | ||
69 | "mlp_dropouts ": "_".join([str(x) for x in mlp_dropouts]), | ||
70 | "mlp_sgd ": mlp_sgd_repr, | ||
71 | "mlp_epochs ": mlp_epochs, | ||
72 | "mlp_batch_size ": mlp_batch_size, | ||
73 | "mlp_output" : mlp_output_activation | ||
74 | } | ||
75 | name = "_".join([ str(x) for x in params.values()]) | ||
76 | try: | ||
77 | os.mkdir("{}/VAE_{}".format(in_dir,name)) | ||
78 | except: | ||
79 | pass | ||
80 | db = shelve.open("{}/VAE_{}/ae_model.shelve".format(in_dir,name),writeback=True) | ||
81 | db["params"] = params | ||
82 | db["LABEL"]=infer_model["LABEL"] | ||
83 | # | ||
84 | json.dump(params, | ||
85 | open("{}/VAE_{}/ae_model.json".format(in_dir,name),"w"), | ||
86 | indent=4) | ||
87 | |||
88 | keys = ["ASR","TRS"] | ||
89 | |||
90 | db["VAE"] = {} | ||
91 | db["LDA"] = {} | ||
92 | for mod in keys : | ||
93 | print mod | ||
94 | db["LDA"][mod] = train_mlp(infer_model["LDA"][mod]["TRAIN"],infer_model["LABEL"][mod]["TRAIN"], | ||
95 | infer_model["LDA"][mod]["DEV"],infer_model["LABEL"][mod]["DEV"], | ||
96 | infer_model["LDA"][mod]["TEST"],infer_model["LABEL"][mod]["TEST"], | ||
97 | mlp_h ,sgd=mlp_sgd, | ||
98 | epochs=mlp_epochs, | ||
99 | batch_size=mlp_batch_size, | ||
100 | input_activation=input_activation, | ||
101 | output_activation=mlp_output_activation, | ||
102 | dropouts=mlp_dropouts, | ||
103 | fit_verbose=0) | ||
104 | |||
105 | res=train_vae(infer_model["LDA"][mod]["TRAIN"],infer_model["LDA"][mod]["DEV"],infer_model["LDA"][mod]["TEST"], | ||
106 | hidden_size=hidden_size[0], | ||
107 | latent_dim=latent_dim,sgd=sgd, | ||
108 | input_activation=input_activation,output_activation=output_activation, | ||
109 | nb_epochs=epochs,batch_size=batch) | ||
110 | mlp_res_list=[] | ||
111 | for layer in res : | ||
112 | mlp_res_list.append(train_mlp(layer[0],infer_model['LABEL'][mod]["TRAIN"], | ||
113 | layer[1],infer_model["LABEL"][mod]["DEV"], | ||
114 | layer[2],infer_model["LABEL"][mod]["TEST"], | ||
115 | mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs, | ||
116 | output_activation=mlp_output_activation, | ||
117 | input_activation=input_activation, | ||
118 | batch_size=mlp_batch_size,fit_verbose=0)) | ||
119 | db["VAE"][mod]=mlp_res_list | ||
120 | |||
121 | mod = "ASR" | ||
122 | mod2= "TRS" | ||
123 | mlp_res_list=[] | ||
124 | |||
125 | res = train_vae(infer_model["LDA"][mod]["TRAIN"], | ||
126 | infer_model["LDA"][mod]["DEV"], | ||
127 | infer_model["LDA"][mod]["TEST"], | ||
128 | hidden_size=hidden_size[0], | ||
129 | sgd=sgd,input_activation=input_activation,output_activation=output_activation, | ||
130 | latent_dim=latent_dim, | ||
131 | nb_epochs=epochs, | ||
132 | batch_size=batch, | ||
133 | y_train=infer_model["LDA"][mod2]["TRAIN"], | ||
134 | y_dev=infer_model["LDA"][mod2]["DEV"], | ||
135 | y_test=infer_model["LDA"][mod2]["TEST"]) | ||
136 | |||
137 | for layer in res : | ||
138 | mlp_res_list.append(train_mlp(layer[0],infer_model["LABEL"][mod]["TRAIN"], | ||
139 | layer[1],infer_model["LABEL"][mod]["DEV"], | ||
140 | layer[2],infer_model["LABEL"][mod]["TEST"], | ||
141 | mlp_h,loss=mlp_loss,dropouts=mlp_dropouts,sgd=mlp_sgd,epochs=mlp_epochs, | ||
142 | output_activation=mlp_output_activation, | ||
143 | input_activation=input_activation, | ||
144 | batch_size=mlp_batch_size,fit_verbose=0)) | ||
145 | |||
146 | db["VAE"]["SPE"] = mlp_res_list | ||
147 | |||
148 | db.sync() | ||
149 | db.close() | ||
150 |
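train_vae comes from the new vae.py (only its header survives below). The core of any Keras VAE is the reparameterization trick: sample z = mu + exp(log_var / 2) * eps with eps ~ N(0, 1), so the sampling step stays differentiable. A sketch assuming the usual Keras backend API:

    from keras import backend as K
    from keras.layers import Lambda

    def sampling(args):
        # z = mu + sigma * eps keeps gradients flowing through mu and log_var
        z_mean, z_log_var = args
        eps = K.random_normal(shape=K.shape(z_mean))
        return z_mean + K.exp(0.5 * z_log_var) * eps

    # z = Lambda(sampling)([z_mean, z_log_var])  # latent_dim = 30 in this script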
LDA/05-mmf_getscore.py
File was created | 1 | import numpy as np | |
2 | import shelve | ||
3 | import sys | ||
4 | import glob | ||
5 | from collections import defaultdict | ||
6 | from tinydb import TinyDB, Query | ||
7 | from mako.template import Template | ||
8 | import time | ||
9 | |||
10 | def get_best(x): | ||
11 | argbest=np.argmax(x[1]) | ||
12 | maxdev=x[1][argbest] | ||
13 | maxtrain=np.max(x[0]) | ||
14 | maxtest=np.max(x[2]) | ||
15 | besttest=x[2][argbest] | ||
16 | return ( maxtrain,maxdev,maxtest,besttest) | ||
17 | depth = lambda L: isinstance(L, list) and max(map(depth, L))+1 | ||
18 | |||
19 | |||
20 | template_name = ''' | ||
21 | ${name} | ||
22 | ======================== | ||
23 | |||
24 | MLP scores : | ||
25 | ------------------- | ||
26 | ''' | ||
27 | template_value='''\n\n | ||
28 | | ${model} ${ttype} | train | dev |max test| best test| | ||
29 | | -------------------:|:--------:|:---------:|:------:|:--------:| | ||
30 | % for cpt,line in enumerate(models[model][ttype]): | ||
31 | | ${cpt} | ${line[0]} | ${line[1]} |${line[2]} | ${line[3]} | | ||
32 | % endfor | ||
33 | \n | ||
34 | ''' | ||
35 | |||
36 | # ae_model.shelve | ||
37 | def get_folder_file(x): | ||
38 | folder=x.split("/")[1] | ||
39 | shelve_file = ".".join(x.split(".")[:-1]) | ||
40 | return(folder,shelve_file) | ||
41 | |||
42 | in_folder = sys.argv[1] | ||
43 | |||
44 | |||
45 | models = defaultdict(dict) | ||
46 | |||
47 | ae_model_list = glob.glob("{}/*/ae_model.shelve.dir".format(in_folder)) | ||
48 | ae_model_list = sorted(ae_model_list) | ||
49 | ae_model_list= map(get_folder_file,ae_model_list) | ||
50 | for name , shelve_file in ae_model_list : | ||
51 | print Template(template_name).render(name=name) | ||
52 | opened_shelve = shelve.open(shelve_file) | ||
53 | keys = opened_shelve.keys() | ||
54 | if "LABEL" in keys : | ||
55 | keys.remove("LABEL") | ||
56 | if "params" in keys: | ||
57 | keys.remove("params") | ||
58 | to_print = [] | ||
59 | for working_key in keys: | ||
60 | for key in opened_shelve[working_key].keys(): | ||
61 | table_depth = depth(opened_shelve[working_key][key]) | ||
62 | if table_depth == 3 : | ||
63 | models[working_key][key] = [ get_best(x) for x in opened_shelve[working_key][key] ] | ||
64 | to_print.append(Template(template_value).render(model=working_key,ttype=key,models=models).strip()) | ||
65 | elif table_depth == 2 : | ||
66 | models[working_key][key] = [ get_best(opened_shelve[working_key][key]) ] | ||
67 | to_print.append(Template(template_value).render(model=working_key,ttype=key,models=models).strip()) | ||
68 | elif table_depth == 4 : | ||
69 | for layer in opened_shelve[working_key][key] : | ||
70 | models[working_key][key] = [ get_best(x) for x in layer ] | ||
71 | to_print.append(Template(template_value).render(model=working_key,ttype=key,models=models).strip()) | ||
72 | print "\n".join(to_print) | ||
73 | |||
74 |
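get_best reads a (train, dev, test) triple of per-epoch scores and reports the train, dev, and test maxima plus the test score at the best dev epoch; the depth lambda tells the three result layouts apart. A small worked example with made-up scores (get_best and depth as defined above):

    res = ([0.60, 0.70], [0.50, 0.80], [0.55, 0.75])  # (train, dev, test) curves
    print get_best(res)  # -> (0.7, 0.8, 0.75, 0.75): best dev is epoch 1

    print depth([1, 2])      # 1
    print depth([[1], [2]])  # 2 -> one model: a single get_best
    print depth([[[1]]])     # 3 -> list of models: get_best per element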
LDA/run2.sh
1 | #python 00-prepross.py | 1 | #python 00-prepross.py |
2 | python 02-lda-order.py DECODA_list_wid.shelve output_v5/perplex.db 50 10 output_v5 50_500 0 1_0.1 1_0.1 500_1000 100_2000 | 2 | python 02-lda.py DECODA_list_wid.shelve output_v5/perplex.db 50 10 output_v5 50_500 0 1_0.1 1_0.1 500_1000 100_2000 |
3 | #python 03-perplex.py DECODA_list_wid.shelve output_v5 output_v5/perplex.db | 3 | #python 03-perplex.py DECODA_list_wid.shelve output_v5 output_v5/perplex.db |
4 | python 03-order_by_perp.py output_v5/perplex.db output_v5 | 4 | python 03-order_by_perp.py output_v5/perplex.db output_v5 |
5 | bash 04-run_mlp_ae.sh output_v5 DECODA_list_wid.shelve | 5 | bash 04-run_mlp_ae.sh output_v5 DECODA_list_wid.shelve |
6 | python 05-getscore.py output_v5 > res.mkd | 6 | python 05-getscore.py output_v5 > res.mkd |
7 | notedown res.mkd >res_v5.ipynb | 7 | notedown res.mkd >res_v5.ipynb |
8 | 8 |
LDA/utils.py
1 | # -*- coding: utf-8 -*- | 1 | # -*- coding: utf-8 -*- |
2 | import nltk | 2 | import nltk |
3 | import re | 3 | import re |
4 | pattern = ur"\d+(?:\.\d+)?\s*%?|\w{1,2}'|<unk>|[\wéàèùêôûâòìîç]+|[^\w\s]" | 4 | pattern = ur"\d+(?:\.\d+)?\s*%?|\w{1,2}'|<unk>|[\wéàèùêôûâòìîç]+|[^\w\s]" |
5 | rer_b = re.compile(ur" r e r(?: e r)? b ") | 5 | rer_b = re.compile(ur" r e r(?: e r)? b ") |
6 | rer_c = re.compile(ur" r e r(?: e r)? c |r e r( e r)? c' est | rer c' est") | 6 | rer_c = re.compile(ur" r e r(?: e r)? c |r e r( e r)? c' est | rer c' est") |
7 | rer = re.compile(ur" (e )?r e r(?: e r)? |re r( e r)? |rer e r | r e rer | r e r | r e rer |r( e r)+ ") | 7 | rer = re.compile(ur" (e )?r e r(?: e r)? |re r( e r)? |rer e r | r e rer | r e r | r e rer |r( e r)+ ") |
8 | sncf = re.compile(ur" s n c f ") | 8 | sncf = re.compile(ur" s n c f ") |
9 | jusq = re.compile(ur" jusqu ' ") | 9 | jusq = re.compile(ur" jusqu ' ") |
10 | ratp = re.compile(ur" r a t(?: p)? ") | 10 | ratp = re.compile(ur" r a t(?: p)? ") |
11 | quel = re.compile(ur" quelqu ' ") | 11 | quel = re.compile(ur" quelqu ' ") |
12 | space = re.compile(ur" +") | 12 | space = re.compile(ur" +") |
13 | tok2 = nltk.RegexpTokenizer(pattern,flags=re.UNICODE ) | 13 | tok2 = nltk.RegexpTokenizer(pattern,flags=re.UNICODE ) |
14 | # (?x)\d+(\.\d+)?\s*%| \w'| \w+| [^\w\s] | 14 | # (?x)\d+(\.\d+)?\s*%| \w'| \w+| [^\w\s] |
15 | 15 | ||
16 | def preproc(line): | 16 | def preproc(line): |
17 | # print 1,line.encode('utf8') | 17 | # print 1,line.encode('utf8') |
18 | line = space.subn(u" ",line)[0] | 18 | line = space.subn(u" ",line)[0] |
19 | line = rer_b.subn(u" rer b ",line)[0] | 19 | line = rer_b.subn(u" rer b ",line)[0] |
20 | line = rer_c.subn(u" rer c ",line)[0] | 20 | line = rer_c.subn(u" rer c ",line)[0] |
21 | line = rer.subn(u" rer ",line)[0] | 21 | line = rer.subn(u" rer ",line)[0] |
22 | line = sncf.subn(u" sncf ",line)[0] | 22 | line = sncf.subn(u" sncf ",line)[0] |
23 | line = ratp.subn(u" ratp ",line)[0] | 23 | line = ratp.subn(u" ratp ",line)[0] |
24 | line = jusq.subn(u" jusqu' ",line)[0] | 24 | line = jusq.subn(u" jusqu' ",line)[0] |
25 | line = quel.subn(u" quelqu' ",line)[0] | 25 | line = quel.subn(u" quelqu' ",line)[0] |
26 | line = space.subn(u" ",line)[0] | 26 | line = space.subn(u" ",line)[0] |
27 | # print 2,line.encode('utf8') | 27 | # print 2,line.encode('utf8') |
28 | return line.lower() | 28 | return line.lower() |
29 | 29 | ||
30 | def yield_corpus(df_list): | 30 | def yield_corpus(df_list): |
31 | for corpus in df_list: | 31 | for corpus in df_list: |
32 | for id,doc in corpus.iterrows(): | 32 | for id,doc in corpus.iterrows(): |
33 | try: | 33 | try: |
34 | a = tok2.tokenize(preproc(doc[2].decode("utf-8"))) | 34 | a = tok2.tokenize(preproc(doc[2].decode("utf-8"))) |
35 | # print 3, " ".join(a).encode("utf8") | 35 | # print 3, " ".join(a).encode("utf8") |
36 | yield a | 36 | yield a |
37 | except: | 37 | except: |
38 | print doc[2] | 38 | print doc[2] |
39 | raise | 39 | raise |
40 | def select(elm): | 40 | def select(elm): |
41 | return int(elm.split("_")[-1]) | 41 | return int(elm.split("_")[-1]) |
42 | |||
43 | |||
44 | def select_mmf(elm): | ||
45 | return int(elm.split("_")[0]) | ||
42 | 46 |
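A quick usage sketch for preproc and tok2 (hypothetical input line, Python 2): spelled-out acronyms left by the ASR are collapsed into single tokens before tokenization.

# -*- coding: utf-8 -*-
from utils import preproc, tok2

line = u"je prends le r e r b jusqu ' à la gare s n c f de lyon"
print preproc(line)
# -> u"je prends le rer b jusqu' à la gare sncf de lyon"
print tok2.tokenize(preproc(line))
# -> roughly [u'je', u'prends', u'le', u'rer', u'b', u'jusqu', u"'", u'à', ...]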
LDA/vae.py
File was created | 1 | '''Variational autoencoder built with Keras; train_vae() exposes the latent means for reuse as features. | |
2 | Reference: "Auto-Encoding Variational Bayes" https://arxiv.org/abs/1312.6114 | ||
3 | ''' | ||
4 | |||
5 | import itertools | ||
6 | import sys | ||
7 | import json | ||
8 | |||
9 | import numpy as np | ||
10 | import matplotlib.pyplot as plt | ||
11 | from scipy import sparse | ||
12 | import scipy.io | ||
13 | |||
14 | from keras.layers import Input, Dense, Lambda | ||
15 | from keras.models import Model | ||
16 | from keras import backend as K | ||
17 | from keras import objectives | ||
18 | from keras.datasets import mnist | ||
19 | |||
20 | import pandas | ||
21 | import shelve | ||
22 | import pickle | ||
23 | |||
24 | |||
25 | |||
26 | |||
27 | |||
28 | #batch_size = 16 | ||
29 | #original_dim = 784 | ||
30 | #latent_dim = 2 | ||
31 | #intermediate_dim = 128 | ||
32 | #epsilon_std = 0.01 | ||
33 | #nb_epoch = 40 | ||
34 | |||
35 | |||
36 | |||
37 | |||
38 | def train_vae(x_train, x_dev, x_test, y_train=None, y_dev=None, y_test=None, hidden_size=80, latent_dim=12, batch_size=8, nb_epochs=10, sgd="rmsprop", input_activation="relu", output_activation="sigmoid", epsilon_std=0.01): | ||
39 | |||
40 | |||
41 | |||
42 | def sampling(args): # reparameterization trick: z = mu + sigma * epsilon | ||
43 | z_mean, z_log_std = args | ||
44 | epsilon = K.random_normal(shape=(batch_size, latent_dim), | ||
45 | mean=0., std=epsilon_std) | ||
46 | return z_mean + K.exp(z_log_std) * epsilon | ||
47 | |||
48 | def vae_loss(x, x_decoded_mean): # reconstruction term + KL(N(mu, sigma) || N(0, 1)) | ||
49 | xent_loss = objectives.binary_crossentropy(x, x_decoded_mean) | ||
50 | kl_loss = - 0.5 * K.mean(1 + 2 * z_log_std - K.square(z_mean) - K.exp(2 * z_log_std), axis=-1) # with z_log_std = log(sigma), matching sampling() above | ||
51 | return xent_loss + kl_loss | ||
52 | |||
53 | original_dim = x_train.shape[1] | ||
54 | |||
55 | |||
56 | x = Input(batch_shape=(batch_size, original_dim)) # fixed batch size: sample counts must be multiples of batch_size | ||
57 | h = Dense(hidden_size, activation=input_activation)(x) | ||
58 | z_mean = Dense(latent_dim)(h) | ||
59 | z_log_std = Dense(latent_dim)(h) | ||
60 | |||
61 | |||
62 | # note that "output_shape" isn't necessary with the TensorFlow backend | ||
63 | # so you could write `Lambda(sampling)([z_mean, z_log_std])` | ||
64 | z = Lambda(sampling, output_shape=(latent_dim,))([z_mean, z_log_std]) | ||
65 | |||
66 | # we instantiate these layers separately so as to reuse them later | ||
67 | decoder_h = Dense(hidden_size, activation=input_activation) | ||
68 | decoder_mean = Dense(original_dim, activation=output_activation) | ||
69 | h_decoded = decoder_h(z) | ||
70 | x_decoded_mean = decoder_mean(h_decoded) | ||
71 | |||
72 | |||
73 | vae = Model(x, x_decoded_mean) | ||
74 | vae.compile(optimizer=sgd, loss=vae_loss) | ||
75 | |||
76 | # if no targets are given, train the VAE to reconstruct its inputs | ||
77 | if y_train is None or y_dev is None or y_test is None : | ||
78 | y_train = x_train | ||
79 | y_dev = x_dev | ||
80 | y_test = x_test | ||
81 | |||
82 | vae.fit(x_train, y_train, | ||
83 | shuffle=True, | ||
84 | nb_epoch=nb_epochs, | ||
85 | batch_size=batch_size, | ||
86 | validation_data=(x_dev, y_dev)) | ||
87 | |||
88 | # build a model to project inputs on the latent space | ||
89 | encoder = Model(x, z_mean) | ||
90 | pred_train = encoder.predict(x_train, batch_size=batch_size) | ||
91 | pred_dev = encoder.predict(x_dev, batch_size=batch_size) | ||
92 | pred_test = encoder.predict(x_test,batch_size=batch_size) | ||
93 | return [ [ pred_train, pred_dev, pred_test ] ] | ||
94 | # display a 2D plot of the digit classes in the latent space | ||
95 | #x_test_encoded = encoder.predict(x_test, batch_size=batch_size) | ||
96 | # build a digit generator that can sample from the learned distribution | ||
97 | #decoder_input = Input(shape=(latent_dim,)) | ||
98 | #_h_decoded = decoder_h(decoder_input) | ||
99 | #_x_decoded_mean = decoder_mean(_h_decoded) | ||
100 | #generator = Model(decoder_input, _x_decoded_mean) | ||
101 | #x_decoded = generator.predict(z_sample) | ||
102 | |||
103 |
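A minimal smoke test for train_vae (random data standing in for the LDA/MMF features; all shapes and hyperparameters here are made up, and the Keras 1.x API the script targets is assumed). Sample counts are chosen as multiples of batch_size, since the sampling layer draws epsilon with a fixed batch dimension.

import numpy as np
from vae import train_vae

x_train = np.random.uniform(size=(64, 40)).astype("float32")
x_dev = np.random.uniform(size=(32, 40)).astype("float32")
x_test = np.random.uniform(size=(32, 40)).astype("float32")

[[z_train, z_dev, z_test]] = train_vae(x_train, x_dev, x_test,
                                       hidden_size=80, latent_dim=12,
                                       batch_size=8, nb_epochs=2)
print z_train.shape  # (64, 12): latent means, usable as downstream features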