Commit d1012a7a1689588ac0d1e4a716497562663c14c2
1 parent ee9023b1c9
Exists in master
update LDA/.py
Showing 7 changed files with 8 additions and 289 deletions
LDA/00-mmf_make_features.py
@@ -21,9 +21,9 @@
 data["LABEL"]= {}
 data["LDA"] = {"ASR":{},"TRS":{}}
 for mod in ["ASR", "TRS" ]:
-    train = pandas.read_table("{}/{}/train_{}.ssv".format(input_dir, mod, level), sep=" ", header=None )
-    dev = pandas.read_table("{}/{}/dev_{}.ssv".format(input_dir, mod, level), sep=" ", header=None )
-    test = pandas.read_table("{}/{}/test_{}.ssv".format(input_dir, mod, level), sep=" ", header=None )
+    train = pandas.read_table("{}/{}/train_{}.tab".format(input_dir, mod, level), sep=" ", header=None )
+    dev = pandas.read_table("{}/{}/dev_{}.tab".format(input_dir, mod, level), sep=" ", header=None )
+    test = pandas.read_table("{}/{}/test_{}.tab".format(input_dir, mod, level), sep=" ", header=None )

     y_train = train.iloc[:,0].apply(select)
     y_dev = dev.iloc[:,0].apply(select)

@@ -32,12 +32,12 @@
     data["LABEL"][mod]={"TRAIN":lb.transform(y_train),"DEV":lb.transform(y_dev), "TEST": lb.transform(y_test)}

     # data["LDA"][mod]={'ASR':[]}
-    print data["LDA"][mod]
     print train.values
     data["LDA"][mod]["TRAIN"]=train.iloc[:,1:-1].values
     data["LDA"][mod]["DEV"]=dev.iloc[:,1:-1].values
     data["LDA"][mod]["TEST"]=test.iloc[:,1:-1].values

+    print data["LDA"][mod]["TRAIN"].shape
 data.sync()
 data.close()
LDA/02-lda_split.py
@@ -1,128 +0,0 @@
-import gensim
-import os
-import sys
-import pickle
-from gensim.models.ldamodel import LdaModel
-from gensim.models.ldamulticore import LdaMulticore
-from collections import Counter
-import numpy as np
-import codecs
-import shelve
-import logging
-
-def calc_perp(in_dir,train):
-    name = in_dir.split("/")[-1]
-    # s40_it1_sw50_a0.01_e0.1_p6_c1000
-    sw_size = int(name.split("_")[2][2:])
-
-    logging.warning(" go {} ".format(name))
-
-
-    logging.warning("Redo Vocab and stop")
-    asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y])
-    trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y])
-    asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ]
-    trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ]
-    stop_words=set(asr_sw) | set(trs_sw)
-
-    logging.warning("TRS to be done")
-    entry = Query()
-    value=db.search(entry.name == name)
-    if len(value) > 0 :
-        logging.warning("{} already done".format(name))
-        return
-
-    dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]]
-    lda_trs = LdaModel.load("{}/lda_trs.model".format(in_dir))
-    perp_trs = lda_trs.log_perplexity(dev_trs)
-    logging.warning("ASR to be done")
-    dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]]
-    lda_asr = LdaModel.load("{}/lda_asr.model".format(in_dir))
-    perp_asr = lda_asr.log_perplexity(dev_asr)
-    logging.warning("ASR saving")
-    res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs}
-    return res_dict
-
-
-
-
-def train_lda(out_dir,train,name,size,it,sw_size,alpha,eta,passes,chunk):
-    output_dir = "{}/s{}_it{}_sw{}_a{}_e{}_p{}_c{}".format(out_dir,size,it,sw_size,alpha,eta,passes,chunk)
-    os.mkdir(output_dir)
-    logging.info(output_dir+" to be done")
-    asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y])
-    trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y])
-    asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ]
-    trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ]
-    stop_words=set(asr_sw) | set(trs_sw)
-
-    logging.info("TRS to be done")
-
-    lda_trs = LdaModel(corpus=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["TRAIN"]], id2word=train["vocab"], num_topics=int(size), chunksize=1000,iterations=it)
-
-    logging.info("ASR to be done")
-    lda_asr = LdaModel(corpus=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["TRAIN"]], id2word=train["vocab"], num_topics=int(size), chunksize=1000,iterations=it)
-
-    #logger.info("ASR saving")
-    #lda_asr.save("{}/lda_asr.model".format(output_dir,name,size,it))
-    #lda_trs.save("{}/lda_trs.model".format(output_dir,name,size,it))
-
-
-    out_file_asr=codecs.open("{}/asr_wordTopic.txt".format(output_dir),"w","utf-8")
-    out_file_trs=codecs.open("{}/trs_wordTopic.txt".format(output_dir),"w","utf-8")
-
-    dico = train["vocab"]
-    print >>out_file_asr, ",\t".join( [ dico[x] for x in range(len(train["vocab"]))])
-    for line in lda_asr.expElogbeta:
-        nline = line / np.sum(line)
-        print >>out_file_asr, ",\t".join( str(x) for x in nline)
-    out_file_asr.close()
-
-    print >>out_file_trs, ",\t".join( [ dico[x] for x in range(len(train["vocab"]))])
-    for line in lda_trs.expElogbeta:
-        nline = line / np.sum(line)
-        print >>out_file_trs, ",\t".join( str(x) for x in nline)
-    out_file_trs.close()
-
-    K = lda_asr.num_topics
-    topicWordProbMat = lda_asr.print_topics(K,10)
-    out_file_asr=codecs.open("{}/asr_best10.txt".format(output_dir),"w","utf-8")
-    for i in topicWordProbMat:
-        print >>out_file_asr,i
-    out_file_asr.close()
-
-    K = lda_trs.num_topics
-    topicWordProbMat = lda_trs.print_topics(K,10)
-    out_file_trs=codecs.open("{}/trs_best10.txt".format(output_dir),"w","utf-8")
-    for i in topicWordProbMat:
-        print >>out_file_trs,i
-    out_file_trs.close()
-
-if __name__ == "__main__":
-    logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING)
-
-    input_shelve = sys.argv[1]
-    output_dir = sys.argv[2]
-    size = [ int(x) for x in sys.argv[3].split("_")]
-    workers = int(sys.argv[4])
-    name = sys.argv[5]
-    it = [ int(x) for x in sys.argv[6].split("_")]
-    sw_size = [ int(x) for x in sys.argv[7].split("_")]
-    alpha = ["auto" , "symmetric"] + [ float(x) for x in sys.argv[8].split("_")]
-    eta = ["auto"] + [ float(x) for x in sys.argv[9].split("_")]
-    passes = [ int(x) for x in sys.argv[10].split("_")]
-    chunk = [ int(x) for x in sys.argv[11].split("_")]
-
-    #train=pickle.load(open("{}/newsgroup_bow_train.pk".format(input_dir)))
-    train = shelve.open(input_shelve)
-    out_dir = "{}/{}".format(output_dir,name)
-    os.mkdir(out_dir)
-
-    for s in size:
-        for i in it :
-            for sw in sw_size:
-                for a in alpha:
-                    for e in eta:
-                        for p in passes:
-                            for c in chunk:
-                                train_lda(out_dir,train,name,s,i,sw,a,e,p,c)
LDA/02b-lda_order.py
@@ -1,149 +0,0 @@
-import gensim
-import os
-import sys
-import pickle
-from gensim.models.ldamodel import LdaModel
-from gensim.models.ldamulticore import LdaMulticore
-from collections import Counter
-import numpy as np
-import codecs
-import shelve
-import logging
-import dill
-from tinydb import TinyDB, where, Query
-import time
-from joblib import Parallel, delayed
-
-def calc_perp(models,train):
-
-
-    stop_words=models[1]
-    name = models[0]
-
-    logging.warning(" go {} ".format(name))
-    logging.warning("TRS to be done")
-    entry = Query()
-    value=db.search(entry.name == name)
-    if len(value) > 0 :
-        logging.warning("{} already done".format(name))
-        return
-
-    dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]]
-    lda_trs = models[2]
-    perp_trs = lda_trs.log_perplexity(dev_trs)
-
-    logging.warning("ASR to be done")
-    dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]]
-    lda_asr = models[5]
-    perp_asr = lda_asr.log_perplexity(dev_asr)
-    logging.warning("ASR saving")
-    res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs }
-    return res_dict
-
-
-
-
-def train_lda(out_dir,train,size,it,sw_size,alpha,eta,passes,chunk):
-    name = "s{}_it{}_sw{}_a{}_e{}_p{}_c{}".format(size,it,sw_size,alpha,eta,passes,chunk)
-    logging.warning(name)
-    deep_out_dir = out_dir+"/"+name
-    if os.path.isdir(deep_out_dir):
-        logging.error(name+" already done")
-        return
-    logging.warning(name+" to be done")
-    asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y])
-    trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y])
-    asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ]
-    trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ]
-    stop_words=set(asr_sw) | set(trs_sw)
-
-    logging.warning("TRS to be done")
-
-    lda_trs = LdaModel(corpus=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["TRAIN"]], id2word=train["vocab"], num_topics=int(size), chunksize=chunk,iterations=it,alpha=alpha,eta=eta,passes=passes)
-
-    logging.warning("ASR to be done")
-    lda_asr = LdaModel(corpus=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["TRAIN"]], id2word=train["vocab"], num_topics=int(size), chunksize=chunk,iterations=it,alpha=alpha,eta=eta,passes=passes)
-
-    dico = train["vocab"]
-    word_list = [ dico[x] for x in range(len(train["vocab"]))]
-    asr_probs = []
-    for line in lda_asr.expElogbeta:
-        nline = line / np.sum(line)
-        asr_probs.append([ str(x) for x in nline])
-    trs_probs = []
-    for line in lda_trs.expElogbeta:
-        nline = line / np.sum(line)
-        trs_probs.append([str(x) for x in nline])
-
-    K = lda_asr.num_topics
-    topicWordProbMat_asr = lda_asr.print_topics(K,10)
-
-    K = lda_trs.num_topics
-    topicWordProbMat_trs = lda_trs.print_topics(K,10)
-    os.mkdir(deep_out_dir)
-    dill.dump([x for x in stop_words],open(deep_out_dir+"/stopwords.dill","w"))
-    lda_asr.save(deep_out_dir+"/lda_asr.model")
-    lda_trs.save(deep_out_dir+"/lda_trs.model")
-    dill.dump([x for x in asr_probs],open(deep_out_dir+"/lda_asr_probs.dill","w"))
-    dill.dump([x for x in trs_probs],open(deep_out_dir+"/lda_trs_probs.dill","w"))
-
-    return [name, stop_words, lda_asr , asr_probs , topicWordProbMat_asr, lda_trs, trs_probs, topicWordProbMat_trs]
-
-def train_one(name,train,s,i,sw,a,e,p,c):
-    st=time.time()
-    logging.warning(" ; ".join([str(x) for x in [s,i,sw,a,e,p,c]]))
-    models = train_lda(name,train,s,i,sw,a,e,p,c)
-    if models:
-        m = calc_perp(models,train)
-        #dill.dump(models,open("{}/{}.dill".format(name,models[0]),"wb"))
-    else :
-        m = None
-    e = time.time()
-    logging.warning("fin en : {}".format(e-st))
-    return m
-
-
-
-
-if __name__ == "__main__":
-    logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING)
-
-    input_shelve = sys.argv[1]
-    db_path = sys.argv[2]
-    size = [ int(x) for x in sys.argv[3].split("_")]
-    workers = int(sys.argv[4])
-    name = sys.argv[5]
-    it = [ int(x) for x in sys.argv[6].split("_")]
-    sw_size = [ int(x) for x in sys.argv[7].split("_")]
-    if sys.argv[8] != "None" :
-        alpha = [ "symmetric", "auto" ] + [ float(x) for x in sys.argv[8].split("_")]
-        eta = ["auto"] + [ float(x) for x in sys.argv[9].split("_")]
-    else :
-        alpha = ["symmetric"]
-        eta = ["auto"]
-    passes = [ int(x) for x in sys.argv[10].split("_")]
-    chunk = [ int(x) for x in sys.argv[11].split("_")]
-
-    #train=pickle.load(open("{}/newsgroup_bow_train.pk".format(input_dir)))
-    train = shelve.open(input_shelve)
-    try :
-        os.mkdir(name)
-    except :
-        logging.warning(" folder already existe " )
-    db = TinyDB(db_path)
-    nb_model = len(passes) * len(chunk) * len(it) * len(sw_size) * len(alpha) * len(eta) * len(size)
-    logging.warning(" hey will train {} models ".format(nb_model))
-
-    args_list=[]
-    for p in passes:
-        for c in chunk:
-            for i in it :
-                for sw in sw_size:
-                    for a in alpha:
-                        for e in eta:
-                            for s in size:
-                                args_list.append((name,train,s,i,sw,a,e,p,c))
-    res_list= Parallel(n_jobs=15)(delayed(train_one)(*args) for args in args_list)
-    for m in res_list :
-        db.insert(m)
-
LDA/04b-mini_ae.py
LDA/04e-mm_vae.py
@@ -108,7 +108,7 @@
 if save_projection:
     pd = pandas.DataFrame(layer[0])
     col_count = (pd.sum(axis=0) != 0)
-    pd = pd.loc[:,cyyol_count]
+    pd = pd.loc[:,col_count]
     pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,mod),"TRAIN")
    pd = pandas.DataFrame(layer[1])
     pd = pd.loc[:,col_count]
LDA/run.sh
LDA/vae.py
@@ -128,8 +128,8 @@
         nb_epoch=nb_epochs,
         verbose = 1,
         batch_size=batch_size,
-        validation_data=(x_dev, y_dev),
-        callbacks = [ZeroStopping(monitor='val_loss', thresh=0, verbose=0, mode='min')]
+        validation_data=(x_dev, y_dev)
+        #callbacks = [ZeroStopping(monitor='val_loss', thresh=0, verbose=0, mode='min')]
         )

 # build a model to project inputs on the latent space