Commit d1012a7a1689588ac0d1e4a716497562663c14c2
1 parent ee9023b1c9
Exists in master
update LDA/.py
Showing 7 changed files with 8 additions and 289 deletions
LDA/00-mmf_make_features.py
| ... | ... | @@ -21,9 +21,9 @@ |
| 21 | 21 | data["LABEL"]= {} |
| 22 | 22 | data["LDA"] = {"ASR":{},"TRS":{}} |
| 23 | 23 | for mod in ["ASR", "TRS" ]: |
| 24 | - train = pandas.read_table("{}/{}/train_{}.ssv".format(input_dir, mod, level), sep=" ", header=None ) | |
| 25 | - dev = pandas.read_table("{}/{}/dev_{}.ssv".format(input_dir, mod, level), sep=" ", header=None ) | |
| 26 | - test = pandas.read_table("{}/{}/test_{}.ssv".format(input_dir, mod, level), sep=" ", header=None ) | |
| 24 | + train = pandas.read_table("{}/{}/train_{}.tab".format(input_dir, mod, level), sep=" ", header=None ) | |
| 25 | + dev = pandas.read_table("{}/{}/dev_{}.tab".format(input_dir, mod, level), sep=" ", header=None ) | |
| 26 | + test = pandas.read_table("{}/{}/test_{}.tab".format(input_dir, mod, level), sep=" ", header=None ) | |
| 27 | 27 | |
| 28 | 28 | y_train = train.iloc[:,0].apply(select) |
| 29 | 29 | y_dev = dev.iloc[:,0].apply(select) |
| 30 | 30 | |
| ... | ... | @@ -32,12 +32,12 @@ |
| 32 | 32 | data["LABEL"][mod]={"TRAIN":lb.transform(y_train),"DEV":lb.transform(y_dev), "TEST": lb.transform(y_test)} |
| 33 | 33 | |
| 34 | 34 | # data["LDA"][mod]={'ASR':[]} |
| 35 | - print data["LDA"][mod] | |
| 36 | 35 | print train.values |
| 37 | 36 | data["LDA"][mod]["TRAIN"]=train.iloc[:,1:-1].values |
| 38 | 37 | data["LDA"][mod]["DEV"]=dev.iloc[:,1:-1].values |
| 39 | 38 | data["LDA"][mod]["TEST"]=test.iloc[:,1:-1].values |
| 40 | 39 | |
| 40 | + print data["LDA"][mod]["TRAIN"].shape | |
| 41 | 41 | data.sync() |
| 42 | 42 | data.close() |
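The hunk above only swaps the feature-file extension from `.ssv` to `.tab`; parsing stays space-separated with no header row. A minimal sketch of that loading pattern, assuming a hypothetical `features/ASR/train_utt.tab` path and the same column layout (label first, features in the middle, last column dropped):

```python
# Sketch of the loading pattern in 00-mmf_make_features.py; the path and
# column layout are assumptions for illustration, not the repo's actual data.
import pandas

train = pandas.read_table("features/ASR/train_utt.tab", sep=" ", header=None)
y_train = train.iloc[:, 0]              # first column holds the label
X_train = train.iloc[:, 1:-1].values    # middle columns are the features
```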
LDA/02-lda_split.py
| 1 | -import gensim | |
| 2 | -import os | |
| 3 | -import sys | |
| 4 | -import pickle | |
| 5 | -from gensim.models.ldamodel import LdaModel | |
| 6 | -from gensim.models.ldamulticore import LdaMulticore | |
| 7 | -from collections import Counter | |
| 8 | -import numpy as np | |
| 9 | -import codecs | |
| 10 | -import shelve | |
| 11 | -import logging | |
| 12 | - | |
| 13 | -def calc_perp(in_dir,train): | |
| 14 | - name = in_dir.split("/")[-1] | |
| 15 | - # s40_it1_sw50_a0.01_e0.1_p6_c1000 | |
| 16 | - sw_size = int(name.split("_")[2][2:]) | |
| 17 | - | |
| 18 | - logging.warning(" go {} ".format(name)) | |
| 19 | - | |
| 20 | - | |
| 21 | - logging.warning("Redo Vocab and stop") | |
| 22 | - asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y]) | |
| 23 | - trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y]) | |
| 24 | - asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ] | |
| 25 | - trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ] | |
| 26 | - stop_words=set(asr_sw) | set(trs_sw) | |
| 27 | - | |
| 28 | - logging.warning("TRS to be done") | |
| 29 | - entry = Query() | |
| 30 | - value=db.search(entry.name == name) | |
| 31 | - if len(value) > 0 : | |
| 32 | - logging.warning("{} already done".format(name)) | |
| 33 | - return | |
| 34 | - | |
| 35 | - dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]] | |
| 36 | - lda_trs = LdaModel.load("{}/lda_trs.model".format(in_dir)) | |
| 37 | - perp_trs = lda_trs.log_perplexity(dev_trs) | |
| 38 | - logging.warning("ASR to be done") | |
| 39 | - dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]] | |
| 40 | - lda_asr = LdaModel.load("{}/lda_asr.model".format(in_dir)) | |
| 41 | - perp_asr = lda_asr.log_perplexity(dev_asr) | |
| 42 | - logging.warning("ASR saving") | |
| 43 | - res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs} | |
| 44 | - return res_dict | |
| 45 | - | |
| 46 | - | |
| 47 | - | |
| 48 | - | |
| 49 | -def train_lda(out_dir,train,name,size,it,sw_size,alpha,eta,passes,chunk): | |
| 50 | - output_dir = "{}/s{}_it{}_sw{}_a{}_e{}_p{}_c{}".format(out_dir,size,it,sw_size,alpha,eta,passes,chunk) | |
| 51 | - os.mkdir(output_dir) | |
| 52 | - logging.info(output_dir+" to be done") | |
| 53 | - asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y]) | |
| 54 | - trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y]) | |
| 55 | - asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ] | |
| 56 | - trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ] | |
| 57 | - stop_words=set(asr_sw) | set(trs_sw) | |
| 58 | - | |
| 59 | - logging.info("TRS to be done") | |
| 60 | - | |
| 61 | - lda_trs = LdaModel(corpus=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["TRAIN"]], id2word=train["vocab"], num_topics=int(size), chunksize=1000,iterations=it) | |
| 62 | - | |
| 63 | - logging.info("ASR to be done") | |
| 64 | - lda_asr = LdaModel(corpus=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["TRAIN"]], id2word=train["vocab"], num_topics=int(size), chunksize=1000,iterations=it) | |
| 65 | - | |
| 66 | - #logger.info("ASR saving") | |
| 67 | - #lda_asr.save("{}/lda_asr.model".format(output_dir,name,size,it)) | |
| 68 | - #lda_trs.save("{}/lda_trs.model".format(output_dir,name,size,it)) | |
| 69 | - | |
| 70 | - | |
| 71 | - out_file_asr=codecs.open("{}/asr_wordTopic.txt".format(output_dir),"w","utf-8") | |
| 72 | - out_file_trs=codecs.open("{}/trs_wordTopic.txt".format(output_dir),"w","utf-8") | |
| 73 | - | |
| 74 | - dico = train["vocab"] | |
| 75 | - print >>out_file_asr, ",\t".join( [ dico[x] for x in range(len(train["vocab"]))]) | |
| 76 | - for line in lda_asr.expElogbeta: | |
| 77 | - nline = line / np.sum(line) | |
| 78 | - print >>out_file_asr, ",\t".join( str(x) for x in nline) | |
| 79 | - out_file_asr.close() | |
| 80 | - | |
| 81 | - print >>out_file_trs, ",\t".join( [ dico[x] for x in range(len(train["vocab"]))]) | |
| 82 | - for line in lda_trs.expElogbeta: | |
| 83 | - nline = line / np.sum(line) | |
| 84 | - print >>out_file_trs, ",\t".join( str(x) for x in nline) | |
| 85 | - out_file_trs.close() | |
| 86 | - | |
| 87 | - K = lda_asr.num_topics | |
| 88 | - topicWordProbMat = lda_asr.print_topics(K,10) | |
| 89 | - out_file_asr=codecs.open("{}/asr_best10.txt".format(output_dir),"w","utf-8") | |
| 90 | - for i in topicWordProbMat: | |
| 91 | - print >>out_file_asr,i | |
| 92 | - out_file_asr.close() | |
| 93 | - | |
| 94 | - K = lda_trs.num_topics | |
| 95 | - topicWordProbMat = lda_trs.print_topics(K,10) | |
| 96 | - out_file_trs=codecs.open("{}/trs_best10.txt".format(output_dir),"w","utf-8") | |
| 97 | - for i in topicWordProbMat: | |
| 98 | - print >>out_file_trs,i | |
| 99 | - out_file_trs.close() | |
| 100 | - | |
| 101 | -if __name__ == "__main__": | |
| 102 | - logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING) | |
| 103 | - | |
| 104 | - input_shelve = sys.argv[1] | |
| 105 | - output_dir = sys.argv[2] | |
| 106 | - size = [ int(x) for x in sys.argv[3].split("_")] | |
| 107 | - workers = int(sys.argv[4]) | |
| 108 | - name = sys.argv[5] | |
| 109 | - it = [ int(x) for x in sys.argv[6].split("_")] | |
| 110 | - sw_size = [ int(x) for x in sys.argv[7].split("_")] | |
| 111 | - alpha = ["auto" , "symmetric"] + [ float(x) for x in sys.argv[8].split("_")] | |
| 112 | - eta = ["auto"] + [ float(x) for x in sys.argv[9].split("_")] | |
| 113 | - passes = [ int(x) for x in sys.argv[10].split("_")] | |
| 114 | - chunk = [ int(x) for x in sys.argv[11].split("_")] | |
| 115 | - | |
| 116 | - #train=pickle.load(open("{}/newsgroup_bow_train.pk".format(input_dir))) | |
| 117 | - train = shelve.open(input_shelve) | |
| 118 | - out_dir = "{}/{}".format(output_dir,name) | |
| 119 | - os.mkdir(out_dir) | |
| 120 | - | |
| 121 | - for s in size: | |
| 122 | - for i in it : | |
| 123 | - for sw in sw_size: | |
| 124 | - for a in alpha: | |
| 125 | - for e in eta: | |
| 126 | - for p in passes: | |
| 127 | - for c in chunk: | |
| 128 | - train_lda(out_dir,train,name,s,i,sw,a,e,p,c) |
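The deleted `02-lda_split.py` grid-searched gensim LDA hyperparameters, first filtering the `sw_size` most frequent word ids out of the bag-of-words corpora before training. A hedged sketch of that core step, with `docs` (lists of integer word ids) and `id2word` as assumed inputs:

```python
# Sketch of the deleted script's core: frequency-based stop-word filtering,
# then gensim LDA training on (word_id, count) pairs. Inputs are assumptions.
from collections import Counter
from gensim.models import LdaModel

def train_filtered_lda(docs, id2word, num_topics=40, sw_size=50, iterations=100):
    counts = Counter(w for doc in docs for w in doc)
    stop_words = {w for w, _ in counts.most_common(sw_size)}
    corpus = [[(w, c) for w, c in Counter(doc).items() if w not in stop_words]
              for doc in docs]
    lda = LdaModel(corpus=corpus, id2word=id2word,
                   num_topics=num_topics, iterations=iterations)
    return lda, stop_words
```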
LDA/02b-lda_order.py
| 1 | -import gensim | |
| 2 | -import os | |
| 3 | -import sys | |
| 4 | -import pickle | |
| 5 | -from gensim.models.ldamodel import LdaModel | |
| 6 | -from gensim.models.ldamulticore import LdaMulticore | |
| 7 | -from collections import Counter | |
| 8 | -import numpy as np | |
| 9 | -import codecs | |
| 10 | -import shelve | |
| 11 | -import logging | |
| 12 | -import dill | |
| 13 | -from tinydb import TinyDB, where, Query | |
| 14 | -import time | |
| 15 | -from joblib import Parallel, delayed | |
| 16 | - | |
| 17 | -def calc_perp(models,train): | |
| 18 | - | |
| 19 | - | |
| 20 | - stop_words=models[1] | |
| 21 | - name = models[0] | |
| 22 | - | |
| 23 | - logging.warning(" go {} ".format(name)) | |
| 24 | - logging.warning("TRS to be done") | |
| 25 | - entry = Query() | |
| 26 | - value=db.search(entry.name == name) | |
| 27 | - if len(value) > 0 : | |
| 28 | - logging.warning("{} already done".format(name)) | |
| 29 | - return | |
| 30 | - | |
| 31 | - dev_trs=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["DEV"]] | |
| 32 | - lda_trs = models[2] | |
| 33 | - perp_trs = lda_trs.log_perplexity(dev_trs) | |
| 34 | - | |
| 35 | - logging.warning("ASR to be done") | |
| 36 | - dev_asr = [ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["DEV"]] | |
| 37 | - lda_asr = models[5] | |
| 38 | - perp_asr = lda_asr.log_perplexity(dev_asr) | |
| 39 | - logging.warning("ASR saving") | |
| 40 | - res_dict = {"name" : name, "asr" : perp_asr, "trs" : perp_trs } | |
| 41 | - return res_dict | |
| 42 | - | |
| 43 | - | |
| 44 | - | |
| 45 | - | |
| 46 | -def train_lda(out_dir,train,size,it,sw_size,alpha,eta,passes,chunk): | |
| 47 | - name = "s{}_it{}_sw{}_a{}_e{}_p{}_c{}".format(size,it,sw_size,alpha,eta,passes,chunk) | |
| 48 | - logging.warning(name) | |
| 49 | - deep_out_dir = out_dir+"/"+name | |
| 50 | - if os.path.isdir(deep_out_dir): | |
| 51 | - logging.error(name+" already done") | |
| 52 | - return | |
| 53 | - logging.warning(name+" to be done") | |
| 54 | - asr_count=Counter([ x for y in train["ASR_wid"]["TRAIN"] for x in y]) | |
| 55 | - trs_count=Counter([ x for y in train["TRS_wid"]["TRAIN"] for x in y]) | |
| 56 | - asr_sw = [ x[0] for x in asr_count.most_common(sw_size) ] | |
| 57 | - trs_sw = [ x[0] for x in trs_count.most_common(sw_size) ] | |
| 58 | - stop_words=set(asr_sw) | set(trs_sw) | |
| 59 | - | |
| 60 | - logging.warning("TRS to be done") | |
| 61 | - | |
| 62 | - lda_trs = LdaModel(corpus=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["TRS_wid"]["TRAIN"]], id2word=train["vocab"], num_topics=int(size), chunksize=chunk,iterations=it,alpha=alpha,eta=eta,passes=passes) | |
| 63 | - | |
| 64 | - logging.warning("ASR to be done") | |
| 65 | - lda_asr = LdaModel(corpus=[ [ (x,y) for x,y in Counter(z).items() if x not in stop_words] for z in train["ASR_wid"]["TRAIN"]], id2word=train["vocab"], num_topics=int(size), chunksize=chunk,iterations=it,alpha=alpha,eta=eta,passes=passes) | |
| 66 | - | |
| 67 | - dico = train["vocab"] | |
| 68 | - word_list = [ dico[x] for x in range(len(train["vocab"]))] | |
| 69 | - asr_probs = [] | |
| 70 | - for line in lda_asr.expElogbeta: | |
| 71 | - nline = line / np.sum(line) | |
| 72 | - asr_probs.append([ str(x) for x in nline]) | |
| 73 | - trs_probs = [] | |
| 74 | - for line in lda_trs.expElogbeta: | |
| 75 | - nline = line / np.sum(line) | |
| 76 | - trs_probs.append([str(x) for x in nline]) | |
| 77 | - | |
| 78 | - K = lda_asr.num_topics | |
| 79 | - topicWordProbMat_asr = lda_asr.print_topics(K,10) | |
| 80 | - | |
| 81 | - K = lda_trs.num_topics | |
| 82 | - topicWordProbMat_trs = lda_trs.print_topics(K,10) | |
| 83 | - os.mkdir(deep_out_dir) | |
| 84 | - dill.dump([x for x in stop_words],open(deep_out_dir+"/stopwords.dill","w")) | |
| 85 | - lda_asr.save(deep_out_dir+"/lda_asr.model") | |
| 86 | - lda_trs.save(deep_out_dir+"/lda_trs.model") | |
| 87 | - dill.dump([x for x in asr_probs],open(deep_out_dir+"/lda_asr_probs.dill","w")) | |
| 88 | - dill.dump([x for x in trs_probs],open(deep_out_dir+"/lda_trs_probs.dill","w")) | |
| 89 | - | |
| 90 | - return [name, stop_words, lda_asr , asr_probs , topicWordProbMat_asr, lda_trs, trs_probs, topicWordProbMat_trs] | |
| 91 | - | |
| 92 | -def train_one(name,train,s,i,sw,a,e,p,c): | |
| 93 | - st=time.time() | |
| 94 | - logging.warning(" ; ".join([str(x) for x in [s,i,sw,a,e,p,c]])) | |
| 95 | - models = train_lda(name,train,s,i,sw,a,e,p,c) | |
| 96 | - if models: | |
| 97 | - m = calc_perp(models,train) | |
| 98 | - #dill.dump(models,open("{}/{}.dill".format(name,models[0]),"wb")) | |
| 99 | - else : | |
| 100 | - m = None | |
| 101 | - e = time.time() | |
| 102 | - logging.warning("done in: {}".format(e-st)) | |
| 103 | - return m | |
| 104 | - | |
| 105 | - | |
| 106 | - | |
| 107 | - | |
| 108 | -if __name__ == "__main__": | |
| 109 | - logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.WARNING) | |
| 110 | - | |
| 111 | - input_shelve = sys.argv[1] | |
| 112 | - db_path = sys.argv[2] | |
| 113 | - size = [ int(x) for x in sys.argv[3].split("_")] | |
| 114 | - workers = int(sys.argv[4]) | |
| 115 | - name = sys.argv[5] | |
| 116 | - it = [ int(x) for x in sys.argv[6].split("_")] | |
| 117 | - sw_size = [ int(x) for x in sys.argv[7].split("_")] | |
| 118 | - if sys.argv[8] != "None" : | |
| 119 | - alpha = [ "symmetric", "auto" ] + [ float(x) for x in sys.argv[8].split("_")] | |
| 120 | - eta = ["auto"] + [ float(x) for x in sys.argv[9].split("_")] | |
| 121 | - else : | |
| 122 | - alpha = ["symmetric"] | |
| 123 | - eta = ["auto"] | |
| 124 | - passes = [ int(x) for x in sys.argv[10].split("_")] | |
| 125 | - chunk = [ int(x) for x in sys.argv[11].split("_")] | |
| 126 | - | |
| 127 | - #train=pickle.load(open("{}/newsgroup_bow_train.pk".format(input_dir))) | |
| 128 | - train = shelve.open(input_shelve) | |
| 129 | - try : | |
| 130 | - os.mkdir(name) | |
| 131 | - except : | |
| 132 | - logging.warning(" folder already exists ") | |
| 133 | - db = TinyDB(db_path) | |
| 134 | - nb_model = len(passes) * len(chunk) * len(it) * len(sw_size) * len(alpha) * len(eta) * len(size) | |
| 135 | - logging.warning(" will train {} models ".format(nb_model)) | |
| 136 | - | |
| 137 | - args_list=[] | |
| 138 | - for p in passes: | |
| 139 | - for c in chunk: | |
| 140 | - for i in it : | |
| 141 | - for sw in sw_size: | |
| 142 | - for a in alpha: | |
| 143 | - for e in eta: | |
| 144 | - for s in size: | |
| 145 | - args_list.append((name,train,s,i,sw,a,e,p,c)) | |
| 146 | - res_list= Parallel(n_jobs=15)(delayed(train_one)(*args) for args in args_list) | |
| 147 | - for m in res_list : | |
| 148 | - db.insert(m) | |
| 149 | - |
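The deleted `02b-lda_order.py` extended the grid search with dev-set perplexity scoring, TinyDB bookkeeping to skip finished runs, and joblib parallelism over the parameter grid. A sketch of the skip-and-record pattern, with `results.json` as a placeholder path:

```python
# Sketch of the TinyDB bookkeeping in the deleted 02b-lda_order.py;
# the database path is a placeholder, the record fields mirror the code above.
from tinydb import TinyDB, Query

db = TinyDB("results.json")
run = Query()

def already_done(name):
    return len(db.search(run.name == name)) > 0

def record(name, perp_asr, perp_trs):
    # per-word log_perplexity bound for each modality
    db.insert({"name": name, "asr": perp_asr, "trs": perp_trs})
```

One caveat visible in the deleted main loop: `db.insert(m)` also receives the `None` that `train_one` returns for skipped runs, which TinyDB rejects, so a guard like `already_done` arguably belonged there as well.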
LDA/04b-mini_ae.py
LDA/04e-mm_vae.py
| ... | ... | @@ -108,7 +108,7 @@ |
| 108 | 108 | if save_projection: |
| 109 | 109 | pd = pandas.DataFrame(layer[0]) |
| 110 | 110 | col_count = (pd.sum(axis=0) != 0) |
| 111 | - pd = pd.loc[:,cyyol_count] | |
| 111 | + pd = pd.loc[:,col_count] | |
| 112 | 112 | pd.to_hdf("{}/{}/VAE_{}_{}_df.hdf".format(in_dir,name,nb,mod),"TRAIN") |
| 113 | 113 | pd = pandas.DataFrame(layer[1]) |
| 114 | 114 | pd = pd.loc[:,col_count] |
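The one-character fix above (`cyyol_count` to `col_count`) restores the filter that drops all-zero columns from the projected layer before it is written to HDF; the same TRAIN-derived mask is then reused for the other splits so column sets stay aligned. A small self-contained sketch of the pattern, with invented frame contents:

```python
# Sketch of the zero-column filter repaired in the hunk above;
# the DataFrame values are invented for illustration.
import pandas

train_df = pandas.DataFrame([[0.0, 1.5, 0.0], [0.0, 2.0, 3.0]])
col_mask = (train_df.sum(axis=0) != 0)  # True for columns with any mass
train_df = train_df.loc[:, col_mask]    # drops the all-zero first column
```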
LDA/run.sh
LDA/vae.py
| ... | ... | @@ -128,8 +128,8 @@ |
| 128 | 128 | nb_epoch=nb_epochs, |
| 129 | 129 | verbose = 1, |
| 130 | 130 | batch_size=batch_size, |
| 131 | - validation_data=(x_dev, y_dev), | |
| 132 | - callbacks = [ZeroStopping(monitor='val_loss', thresh=0, verbose=0, mode='min')] | |
| 131 | + validation_data=(x_dev, y_dev) | |
| 132 | + #callbacks = [ZeroStopping(monitor='val_loss', thresh=0, verbose=0, mode='min')] | |
| 133 | 133 | ) |
| 134 | 134 | |
| 135 | 135 | # build a model to project inputs on the latent space |
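The vae.py hunk keeps `validation_data` for monitoring but comments out the custom `ZeroStopping` callback (a project-local class, not a Keras built-in), so training now runs for the full `nb_epoch` budget. For reference, a minimal sketch of the equivalent hook using Keras' stock `EarlyStopping`; the tiny model and random data are placeholders, and `epochs=` is the Keras 2 spelling of the `nb_epoch=` used above:

```python
# Sketch only: stock EarlyStopping standing in for the project's ZeroStopping.
# Model, data, and hyperparameters are placeholders.
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping

x = np.random.rand(64, 10)
y = np.random.rand(64, 1)
model = Sequential([Dense(1, input_dim=10)])
model.compile(optimizer="adam", loss="mse")
model.fit(x, y, epochs=5, batch_size=16,
          validation_split=0.2,
          callbacks=[EarlyStopping(monitor="val_loss", mode="min")])
```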