Blame view
W2V/00-prepross.py
2.17 KB
b6d0165d1 Initial commit |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 |
# coding: utf-8
"""Preprocess the DECODA ASR/TRS corpora and cache the result in a shelf.

Steps performed when the script runs:

1. Load the TRAIN/DEV/TEST ``.srl`` files (tab-separated, no header) for
   both the ASR and the TRS versions of the corpus.
2. Build a gensim ``Dictionary`` from the TRAIN splits of ASR and TRS.
3. Convert every document of every split to a list of word ids, dropping
   tokens that are not in the dictionary.
4. Derive an integer label from column 1 of each row (the number after
   the last underscore) and one-hot encode the TRS labels with a
   ``LabelBinarizer`` fitted on the TRS TRAIN labels.
5. Store ``vocab``, ``ASR_wid``, ``TRS_wid`` and ``LABEL`` in
   ``DECODA_list_wid.shelve``.

NOTE(review): ``yield_corpus`` is not defined here; presumably it comes
from ``from utils import *`` -- confirm.
"""

# In[2]:

# Import
import pandas
# Alignment
from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import *
import nltk
import codecs
import gensim
from scipy import sparse
import itertools
from sklearn.feature_extraction.text import CountVectorizer
import scipy.sparse
import scipy.io
from sklearn import preprocessing
import shelve
import pickle
from utils import *

# Names of the corpus splits; every split exists for both ASR and TRS.
_SPLITS = ("TRAIN", "DEV", "TEST")


def _read_srl(kind, split):
    """Load ``../<kind>/corpus_<split>_<kind>.srl`` as a DataFrame.

    The file is read as tab-separated with no header row. Default NA
    detection is disabled, so cell values are kept as they appear in the
    file instead of being turned into NaN.
    """
    path = "../%s/corpus_%s_%s.srl" % (kind, split, kind)
    return pandas.read_table(path, sep="\t", header=None,
                             na_values=None, keep_default_na=False)


def _to_word_ids(frame, token2id):
    """Map each document yielded for ``frame`` to a list of known word ids.

    Tokens missing from ``token2id`` are silently skipped.
    """
    return [[token2id[tok] for tok in doc if tok in token2id]
            for doc in yield_corpus([frame])]


# In[6]:

ASR = {split: _read_srl("ASR", split) for split in _SPLITS}
# Debug output kept from the original script; the parenthesised form
# prints the same thing under Python 2 and Python 3.
print(ASR["TEST"].iterrows())
TRS = {split: _read_srl("TRS", split) for split in _SPLITS}


# In[7]:

def doprint(x):
    """Debug hook, currently disabled: accepts ``x`` and does nothing."""
    # print x.encode("utf8")
    pass


# The vocabulary is built from the TRAIN splits only (ASR followed by TRS).
all_corp = [x for x in yield_corpus([ASR["TRAIN"]] + [TRS["TRAIN"]])]


# In[8]:

vocab = gensim.corpora.dictionary.Dictionary(all_corp)


# In[10]:

ASR_wid = {}
TRS_wid = {}
token2id = vocab.token2id  # hoisted: looked up once instead of per token
for i in _SPLITS:
    ASR_wid[i] = _to_word_ids(ASR[i], token2id)
    TRS_wid[i] = _to_word_ids(TRS[i], token2id)


# In[11]:

def select(elm):
    """Return the integer that follows the last ``_`` in ``elm``.

    Raises ``ValueError`` if that trailing part is not a valid integer.
    """
    return int(elm.split("_")[-1])


# The "label" column is added only after the word-id conversion above, so
# ``yield_corpus`` sees the frames exactly as they were loaded.
for i in _SPLITS:
    ASR[i]["label"] = ASR[i][1].apply(select)
    TRS[i]["label"] = TRS[i][1].apply(select)

lb = preprocessing.LabelBinarizer(neg_label=0)
lb.fit(list(set(TRS["TRAIN"]['label'])))

# Only the TRS labels are binarized and stored, for every split.
labels = {i: lb.transform(TRS[i]['label']) for i in _SPLITS}


# In[4]:

# Open the shelf only once everything has been computed, so a failure in
# the steps above cannot leave a half-written database behind, and always
# close it even if one of the writes fails.
db = shelve.open("DECODA_list_wid.shelve", writeback=True)
try:
    db["vocab"] = vocab
    db["ASR_wid"] = ASR_wid
    db["TRS_wid"] = TRS_wid
    # Assigned in one piece rather than mutated through db["LABEL"][i],
    # which only persists thanks to writeback=True.
    db["LABEL"] = labels
finally:
    # close() syncs pending writeback entries before closing.
    db.close()