Commit d1012a7a1689588ac0d1e4a716497562663c14c2

Authored by Killian
1 parent ee9023b1c9
Exists in master

update LDA/.py

Showing 7 changed files with 8 additions and 289 deletions

LDA/00-mmf_make_features.py
... ... @@ -21,9 +21,9 @@
21 21 data["LABEL"]= {}
22 22 data["LDA"] = {"ASR":{},"TRS":{}}
23 23 for mod in ["ASR", "TRS" ]:
24   - train = pandas.read_table("{}/{}/train_{}.ssv".format(input_dir, mod, level), sep=" ", header=None )
25   - dev = pandas.read_table("{}/{}/dev_{}.ssv".format(input_dir, mod, level), sep=" ", header=None )
26   - test = pandas.read_table("{}/{}/test_{}.ssv".format(input_dir, mod, level), sep=" ", header=None )
  24 + train = pandas.read_table("{}/{}/train_{}.tab".format(input_dir, mod, level), sep=" ", header=None )
  25 + dev = pandas.read_table("{}/{}/dev_{}.tab".format(input_dir, mod, level), sep=" ", header=None )
  26 + test = pandas.read_table("{}/{}/test_{}.tab".format(input_dir, mod, level), sep=" ", header=None )
27 27  
28 28 y_train = train.iloc[:,0].apply(select)
29 29 y_dev = dev.iloc[:,0].apply(select)
30 30  
... ... @@ -32,12 +32,12 @@
32 32 data["LABEL"][mod]={"TRAIN":lb.transform(y_train),"DEV":lb.transform(y_dev), "TEST": lb.transform(y_test)}
33 33  
34 34 # data["LDA"][mod]={'ASR':[]}
35   - print data["LDA"][mod]
36 35 print train.values
37 36 data["LDA"][mod]["TRAIN"]=train.iloc[:,1:-1].values
38 37 data["LDA"][mod]["DEV"]=dev.iloc[:,1:-1].values
39 38 data["LDA"][mod]["TEST"]=test.iloc[:,1:-1].values
40 39  
  40 + print data["LDA"][mod]["TRAIN"].shape
41 41 data.sync()
42 42 data.close()
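
The only change in this file swaps the split files' extension from .ssv to .tab (the space-separated layout is unchanged) and replaces the debug print of the whole dict with a print of the training matrix's shape. A minimal sketch of the resulting loading pattern; the arguments in the commented call are hypothetical stand-ins for the script's input_dir, mod and level values:

import pandas

def load_split(input_dir, mod, level, split):
    # Column 0 holds the label (mapped through select() in the script);
    # columns 1..n-2 hold the features, and the last column is dropped,
    # as in the iloc slices above.
    path = "{}/{}/{}_{}.tab".format(input_dir, mod, split, level)
    df = pandas.read_table(path, sep=" ", header=None)
    return df.iloc[:, 1:-1].values, df.iloc[:, 0]

# Hypothetical call mirroring the loop over modalities:
# x_train, y_train = load_split("features", "ASR", "utt", "train")
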
LDA/02-lda_split.py
1   -import gensim
2   -import os
3   -import sys
4   -import pickle
5   -from gensim.models.ldamodel import LdaModel
6   -from gensim.models.ldamulticore import LdaMulticore
7   -from collections import Counter
8   -import numpy as np
9   -import codecs
10   -import shelve
11   -import logging
12   -
13   -def calc_perp(in_dir,train):
14   - name = in_dir.split("/")[-1]
15   - # s40_it1_sw50_a0.01_e0.1_p6_c1000
16   - sw_size = int(name.split("_")[2][2:])
17   -
18   - logging.warning(" go {} ".format(name))
19   -
20   -
21   - logging.warning("Redo Vocab and stop")
22   - asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y])
23   - trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y])
24   - asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ]
25   - trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ]
26   - stop_words=set(asr_sw) | set(trs_sw)
27   -
28   - logging.warning("TRS to be done")
29   - entry = Query()
30   - value=db.search(entry.name == name)
31   - if len(value) > 0 :
32   - logging.warning("{} already done".format(name))
33   - return
34   -
35   - dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]]
36   - lda_trs = LdaModel.load("{}/lda_trs.model".format(in_dir))
37   - perp_trs = lda_trs.log_perplexity(dev_trs)
38   - logging.warning("ASR to be done")
39   - dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]]
40   - lda_asr = LdaModel.load("{}/lda_asr.model".format(in_dir))
41   - perp_asr = lda_asr.log_perplexity(dev_asr)
42   - logging.warning("ASR saving")
43   - res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs}
44   - return res_dict
45   -
46   -
47   -
48   -
49   -def train_lda(out_dir,train,name,size,it,sw_size,alpha,eta,passes,chunk):
50   - output_dir = "{}/s{}_it{}_sw{}_a{}_e{}_p{}_c{}".format(out_dir,size,it,sw_size,alpha,eta,passes,chunk)
51   - os.mkdir(output_dir)
52   - logging.info(output_dir+" to be done")
53   - asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y])
54   - trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y])
55   - asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ]
56   - trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ]
57   - stop_words=set(asr_sw) | set(trs_sw)
58   -
59   - logging.info("TRS to be done")
60   -
61   - lda_trs = LdaModel(corpus=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["TRAIN"]], id2word=train["vocab"], num_topics=int(size), chunksize=1000,iterations=it)
62   -
63   - logging.info("ASR to be done")
64   - lda_asr = LdaModel(corpus=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["TRAIN"]], id2word=train["vocab"], num_topics=int(size), chunksize=1000,iterations=it)
65   -
66   - #logger.info("ASR saving")
67   - #lda_asr.save("{}/lda_asr.model".format(output_dir,name,size,it))
68   - #lda_trs.save("{}/lda_trs.model".format(output_dir,name,size,it))
69   -
70   -
71   - out_file_asr=codecs.open("{}/asr_wordTopic.txt".format(output_dir),"w","utf-8")
72   - out_file_trs=codecs.open("{}/trs_wordTopic.txt".format(output_dir),"w","utf-8")
73   -
74   - dico = train["vocab"]
75   - print >>out_file_asr, ",\t".join( [ dico[x] for x in range(len(train["vocab"]))])
76   - for line in lda_asr.expElogbeta:
77   - nline = line / np.sum(line)
78   - print >>out_file_asr, ",\t".join( str(x) for x in nline)
79   - out_file_asr.close()
80   -
81   - print >>out_file_trs, ",\t".join( [ dico[x] for x in range(len(train["vocab"]))])
82   - for line in lda_trs.expElogbeta:
83   - nline = line / np.sum(line)
84   - print >>out_file_trs, ",\t".join( str(x) for x in nline)
85   - out_file_trs.close()
86   -
87   - K = lda_asr.num_topics
88   - topicWordProbMat = lda_asr.print_topics(K,10)
89   - out_file_asr=codecs.open("{}/asr_best10.txt".format(output_dir),"w","utf-8")
90   - for i in topicWordProbMat:
91   - print >>out_file_asr,i
92   - out_file_asr.close()
93   -
94   - K = lda_trs.num_topics
95   - topicWordProbMat = lda_trs.print_topics(K,10)
96   - out_file_trs=codecs.open("{}/trs_best10.txt".format(output_dir),"w","utf-8")
97   - for i in topicWordProbMat:
98   - print >>out_file_trs,i
99   - out_file_trs.close()
100   -
101   -if __name__ == "__main__":
102   - logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING)
103   -
104   - input_shelve = sys.argv[1]
105   - output_dir = sys.argv[2]
106   - size = [ int(x) for x in sys.argv[3].split("_")]
107   - workers = int(sys.argv[4])
108   - name = sys.argv[5]
109   - it = [ int(x) for x in sys.argv[6].split("_")]
110   - sw_size = [ int(x) for x in sys.argv[7].split("_")]
111   - alpha = ["auto" , "symmetric"] + [ float(x) for x in sys.argv[8].split("_")]
112   - eta = ["auto"] + [ float(x) for x in sys.argv[9].split("_")]
113   - passes = [ int(x) for x in sys.argv[10].split("_")]
114   - chunk = [ int(x) for x in sys.argv[11].split("_")]
115   -
116   - #train=pickle.load(open("{}/newsgroup_bow_train.pk".format(input_dir)))
117   - train = shelve.open(input_shelve)
118   - out_dir = "{}/{}".format(output_dir,name)
119   - os.mkdir(out_dir)
120   -
121   - for s in size:
122   - for i in it :
123   - for sw in sw_size:
124   - for a in alpha:
125   - for e in eta:
126   - for p in passes:
127   - for c in chunk:
128   - train_lda(out_dir,train,name,s,i,sw,a,e,p,c)
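
The deleted 02-lda_split.py built a corpus-specific stop list from the sw_size most frequent word ids, then trained one LdaModel per modality (ASR and TRS) on the filtered bag-of-words corpora; note that its calc_perp helper referenced db and Query without defining or importing them, which the replacement 02b-lda_order.py below remedies. A condensed sketch of the training step, assuming docs stands in for train["ASR_wid"]["TRAIN"] (lists of word ids) and id2word for train["vocab"]:

from collections import Counter
from gensim.models.ldamodel import LdaModel

def train_filtered_lda(docs, id2word, num_topics, sw_size, iterations):
    # Stop list = the sw_size most frequent word ids over the corpus.
    counts = Counter(w for doc in docs for w in doc)
    stop_words = {w for w, _ in counts.most_common(sw_size)}
    # Bag-of-words corpus with the stop ids filtered out, as above.
    corpus = [[(w, c) for w, c in Counter(doc).items() if w not in stop_words]
              for doc in docs]
    lda = LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics,
                   chunksize=1000, iterations=iterations)
    return lda, stop_words

Dev-set perplexity is then lda.log_perplexity(dev_corpus), with the dev documents filtered through the same stop list, as calc_perp did.
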
LDA/02b-lda_order.py
1   -import gensim
2   -import os
3   -import sys
4   -import pickle
5   -from gensim.models.ldamodel import LdaModel
6   -from gensim.models.ldamulticore import LdaMulticore
7   -from collections import Counter
8   -import numpy as np
9   -import codecs
10   -import shelve
11   -import logging
12   -import dill
13   -from tinydb import TinyDB, where, Query
14   -import time
15   -from joblib import Parallel, delayed
16   -
17   -def calc_perp(models,train):
18   -
19   -
20   - stop_words=models[1]
21   - name = models[0]
22   -
23   - logging.warning(" go {} ".format(name))
24   - logging.warning("TRS to be done")
25   - entry = Query()
26   - value=db.search(entry.name == name)
27   - if len(value) > 0 :
28   - logging.warning("{} already done".format(name))
29   - return
30   -
31   - dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]]
32   - lda_trs = models[2]
33   - perp_trs = lda_trs.log_perplexity(dev_trs)
34   -
35   - logging.warning("ASR to be done")
36   - dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]]
37   - lda_asr = models[5]
38   - perp_asr = lda_asr.log_perplexity(dev_asr)
39   - logging.warning("ASR saving")
40   - res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs }
41   - return res_dict
42   -
43   -
44   -
45   -
46   -def train_lda(out_dir,train,size,it,sw_size,alpha,eta,passes,chunk):
47   - name = "s{}_it{}_sw{}_a{}_e{}_p{}_c{}".format(size,it,sw_size,alpha,eta,passes,chunk)
48   - logging.warning(name)
49   - deep_out_dir = out_dir+"/"+name
50   - if os.path.isdir(deep_out_dir):
51   - logging.error(name+" already done")
52   - return
53   - logging.warning(name+" to be done")
54   - asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y])
55   - trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y])
56   - asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ]
57   - trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ]
58   - stop_words=set(asr_sw) | set(trs_sw)
59   -
60   - logging.warning("TRS to be done")
61   -
62   - lda_trs = LdaModel(corpus=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["TRAIN"]], id2word=train["vocab"], num_topics=int(size), chunksize=chunk,iterations=it,alpha=alpha,eta=eta,passes=passes)
63   -
64   - logging.warning("ASR to be done")
65   - lda_asr = LdaModel(corpus=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["TRAIN"]], id2word=train["vocab"], num_topics=int(size), chunksize=chunk,iterations=it,alpha=alpha,eta=eta,passes=passes)
66   -
67   - dico = train["vocab"]
68   - word_list = [ dico[x] for x in range(len(train["vocab"]))]
69   - asr_probs = []
70   - for line in lda_asr.expElogbeta:
71   - nline = line / np.sum(line)
72   - asr_probs.append([ str(x) for x in nline])
73   - trs_probs = []
74   - for line in lda_trs.expElogbeta:
75   - nline = line / np.sum(line)
76   - trs_probs.append([str(x) for x in nline])
77   -
78   - K = lda_asr.num_topics
79   - topicWordProbMat_asr = lda_asr.print_topics(K,10)
80   -
81   - K = lda_trs.num_topics
82   - topicWordProbMat_trs = lda_trs.print_topics(K,10)
83   - os.mkdir(deep_out_dir)
84   - dill.dump([x for x in stop_words],open(deep_out_dir+"/stopwords.dill","w"))
85   - lda_asr.save(deep_out_dir+"/lda_asr.model")
86   - lda_trs.save(deep_out_dir+"/lda_trs.model")
87   - dill.dump([x for x in asr_probs],open(deep_out_dir+"/lda_asr_probs.dill","w"))
88   - dill.dump([x for x in trs_probs],open(deep_out_dir+"/lda_trs_probs.dill","w"))
89   -
90   - return [name, stop_words, lda_asr , asr_probs , topicWordProbMat_asr, lda_trs, trs_probs, topicWordProbMat_trs]
91   -
92   -def train_one(name,train,s,i,sw,a,e,p,c):
93   - st=time.time()
94   - logging.warning(" ; ".join([str(x) for x in [s,i,sw,a,e,p,c]]))
95   - models = train_lda(name,train,s,i,sw,a,e,p,c)
96   - if models:
97   - m = calc_perp(models,train)
98   - #dill.dump(models,open("{}/{}.dill".format(name,models[0]),"wb"))
99   - else :
100   - m = None
101   - e = time.time()
102   - logging.warning("fin en : {}".format(e-st))
103   - return m
104   -
105   -
106   -
107   -
108   -if __name__ == "__main__":
109   - logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING)
110   -
111   - input_shelve = sys.argv[1]
112   - db_path = sys.argv[2]
113   - size = [ int(x) for x in sys.argv[3].split("_")]
114   - workers = int(sys.argv[4])
115   - name = sys.argv[5]
116   - it = [ int(x) for x in sys.argv[6].split("_")]
117   - sw_size = [ int(x) for x in sys.argv[7].split("_")]
118   - if sys.argv[8] != "None" :
119   - alpha = [ "symmetric", "auto" ] + [ float(x) for x in sys.argv[8].split("_")]
120   - eta = ["auto"] + [ float(x) for x in sys.argv[9].split("_")]
121   - else :
122   - alpha = ["symmetric"]
123   - eta = ["auto"]
124   - passes = [ int(x) for x in sys.argv[10].split("_")]
125   - chunk = [ int(x) for x in sys.argv[11].split("_")]
126   -
127   - #train=pickle.load(open("{}/newsgroup_bow_train.pk".format(input_dir)))
128   - train = shelve.open(input_shelve)
129   - try :
130   - os.mkdir(name)
131   - except :
132   - logging.warning(" folder already existe " )
133   - db = TinyDB(db_path)
134   - nb_model = len(passes) * len(chunk) * len(it) * len(sw_size) * len(alpha) * len(eta) * len(size)
135   - logging.warning(" hey will train {} models ".format(nb_model))
136   -
137   - args_list=[]
138   - for p in passes:
139   - for c in chunk:
140   - for i in it :
141   - for sw in sw_size:
142   - for a in alpha:
143   - for e in eta:
144   - for s in size:
145   - args_list.append((name,train,s,i,sw,a,e,p,c))
146   - res_list= Parallel(n_jobs=15)(delayed(train_one)(*args) for args in args_list)
147   - for m in res_list :
148   - db.insert(m)
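
02b-lda_order.py, also deleted here, wrapped the same training in a hyperparameter grid search parallelized with joblib and cached perplexity results in TinyDB. Two weaknesses are visible above: n_jobs=15 was hardcoded, ignoring the workers argument, and train_one can return None for skipped configurations, which db.insert cannot store. A hedged sketch of the grid-search-with-cache pattern, with run_config standing in for train_one:

from itertools import product
from joblib import Parallel, delayed
from tinydb import TinyDB, Query

def grid_search(db_path, run_config, grid, n_jobs=4):
    # grid maps parameter name -> list of values; run_config returns a
    # dict of scores carrying a unique "name" key, or None when skipped.
    db = TinyDB(db_path)
    entry = Query()
    results = Parallel(n_jobs=n_jobs)(
        delayed(run_config)(*cfg) for cfg in product(*grid.values()))
    for res in results:
        if res is not None and not db.search(entry.name == res["name"]):
            db.insert(res)  # cache so later runs can skip this configuration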
149   -
... ... @@ -69,7 +69,7 @@
69 69  
70 70 keys = ["ASR","TRS"]
71 71  
72   -mlp_h = [ 40 , 25 , 40]
  72 +mlp_h = [ 512 , 1024 , 2048]
73 73 mlp_loss ="categorical_crossentropy"
74 74 mlp_dropouts = [0,0,0,0]
75 75 mlp_sgd = Adam(0.0001)
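
This hunk widens the MLP hidden layers from [40, 25, 40] to [512, 1024, 2048]. A sketch, under the assumption of a standard Keras Sequential stack, of the model these hyperparameters would configure; input_dim and n_classes are placeholders:

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import Adam

def build_mlp(input_dim, n_classes,
              hidden=(512, 1024, 2048), dropouts=(0, 0, 0, 0)):
    model = Sequential()
    for i, h in enumerate(hidden):
        # Only the first layer needs the input dimension.
        kwargs = {"input_dim": input_dim} if i == 0 else {}
        model.add(Dense(h, activation="relu", **kwargs))
        if dropouts[i] > 0:
            model.add(Dropout(dropouts[i]))
    model.add(Dense(n_classes, activation="softmax"))
    model.compile(loss="categorical_crossentropy",
                  optimizer=Adam(0.0001), metrics=["accuracy"])
    return model
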
... ... @@ -108,7 +108,7 @@
108 108 if save_projection:
109 109 pd = pandas.DataFrame(layer[0])
110 110 col_count = (pd.sum(axis=0) != 0)
111   - pd = pd.loc[:,cyyol_count]
  111 + pd = pd.loc[:,col_count]
112 112 pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,mod),"TRAIN")
113 113 pd = pandas.DataFrame(layer[1])
114 114 pd = pd.loc[:,col_count]
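
This hunk fixes a NameError: cyyol_count was a typo for the col_count mask computed two lines earlier. The pattern keeps only latent dimensions that are nonzero somewhere in the training projection and reuses the same mask for the other splits so their columns stay aligned; a small self-contained illustration with made-up shapes:

import numpy as np
import pandas

train_proj = pandas.DataFrame(np.random.rand(4, 5))
train_proj.iloc[:, 2] = 0                  # a dead latent dimension
col_count = (train_proj.sum(axis=0) != 0)  # boolean mask of live columns
train_kept = train_proj.loc[:, col_count]
# dev_kept = dev_proj.loc[:, col_count]    # same mask for the other splits
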
1   -python 00-prepross.py
2   -python 02-lda_split.py DECODA_list_wid.shelve output_v1/ 100 12 test2 1 400
3   -python 03-mono_perplex.py DECODA_list_wid.shelve output_v1/test2 output_v1/t2db.json
... ... @@ -128,8 +128,8 @@
128 128 nb_epoch=nb_epochs,
129 129 verbose = 1,
130 130 batch_size=batch_size,
131   - validation_data=(x_dev, y_dev),
132   - callbacks = [ZeroStopping(monitor='val_loss', thresh=0, verbose=0, mode='min')]
  131 + validation_data=(x_dev, y_dev)
  132 + #callbacks = [ZeroStopping(monitor='val_loss', thresh=0, verbose=0, mode='min')]
133 133 )
134 134  
135 135 # build a model to project inputs on the latent space
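
The final hunk disables the custom ZeroStopping callback (defined elsewhere in this repository; it is not a stock Keras callback) while keeping the dev set as validation data. If a stopping criterion is still wanted, Keras's built-in EarlyStopping is one option; a hedged sketch, with model, x_train, y_train, x_dev, y_dev, nb_epochs and batch_size as placeholders from the surrounding script:

from keras.callbacks import EarlyStopping

early = EarlyStopping(monitor="val_loss", patience=5, mode="min", verbose=0)
# model.fit(x_train, y_train,
#           nb_epoch=nb_epochs,   # spelled epochs= in Keras >= 2
#           verbose=1,
#           batch_size=batch_size,
#           validation_data=(x_dev, y_dev),
#           callbacks=[early])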