# 00-prepross.py 2.17 KB
# coding: utf-8

# In[2]:

# Import
import pandas
# Alignment
from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import *
import nltk
import codecs
import gensim
from scipy import sparse
import itertools
from sklearn.feature_extraction.text import CountVectorizer
import scipy.sparse
import scipy.io
from sklearn import preprocessing
import shelve
import pickle
from utils import *

# In[4]:

# Shelf that receives every artefact produced by this script. writeback=True
# keeps accessed entries cached in memory and writes them back on
# sync()/close().
db = shelve.open(
    "DECODA_list_wid.shelve",
    flag="c",  # shelve's default mode, spelled out
    writeback=True,
)

# In[6]:

def _read_srl(kind, split):
    """Load one ``.srl`` corpus file as a pandas DataFrame.

    Parameters:
        kind: corpus family, ``"ASR"`` or ``"TRS"``; used both as the
            directory name and as the file-name suffix.
        split: data split, ``"TRAIN"``, ``"DEV"`` or ``"TEST"``.

    Returns:
        The DataFrame read from ``../<kind>/corpus_<split>_<kind>.srl``.
        The file is read as tab-separated with no header row, so columns are
        numbered from 0. Default NaN detection is switched off
        (``keep_default_na=False`` with no ``na_values``).
    """
    path = "../{0}/corpus_{1}_{0}.srl".format(kind, split)
    return pandas.read_table(
        path,
        sep="\t",
        header=None,
        na_values=None,
        keep_default_na=False,
    )


_SPLITS = ("TRAIN", "DEV", "TEST")

# Split name -> DataFrame, once per corpus family.
ASR = {}
TRS = {}
for _split in _SPLITS:
    ASR[_split] = _read_srl("ASR", _split)

# Sanity check. The original printed the repr of the ``iterrows()`` generator,
# which shows nothing about the data; the frame's shape is printed instead.
# The parenthesised form is valid on both Python 2 and Python 3.
print(ASR["TEST"].shape)

for _split in _SPLITS:
    TRS[_split] = _read_srl("TRS", _split)

# In[7]:

def doprint(x):
    """Debugging hook that currently does nothing and returns ``None``.

    The UTF-8 print of ``x`` that it used to perform is disabled.
    """
    # Formerly: print x.encode("utf8")
    return None

# Materialise every item yield_corpus produces for the two TRAIN frames (ASR
# first, then TRS). yield_corpus is not defined in this file; presumably it
# comes from the ``utils`` star import and yields one token list per row --
# TODO confirm.
all_corp = list(yield_corpus([ASR["TRAIN"], TRS["TRAIN"]]))

# In[8]:

# Build the gensim dictionary from the combined TRAIN corpora only, then
# persist it in the shelf under the "vocab" key.
vocab = gensim.corpora.dictionary.Dictionary(documents=all_corp)
db["vocab"] = vocab

# In[9]:


# In[10]:


# In[10]:

def _to_word_ids(frame):
    """Convert every document of ``frame`` into a list of vocabulary ids.

    Documents are produced by ``yield_corpus([frame])``. Tokens that are not
    keys of ``vocab.token2id`` are dropped, so a document may come out shorter
    than its input, or empty.
    """
    token2id = vocab.token2id
    encoded = []
    for document in yield_corpus([frame]):
        encoded.append([token2id[token] for token in document if token in token2id])
    return encoded


# Split name -> list of id-encoded documents, for both corpus families.
ASR_wid = {}
TRS_wid = {}
for split in ASR:
    ASR_wid[split] = _to_word_ids(ASR[split])
    TRS_wid[split] = _to_word_ids(TRS[split])

db["ASR_wid"] = ASR_wid
db["TRS_wid"] = TRS_wid
# In[11]:

def select(elm):
    """Return the integer that follows the last underscore of ``elm``.

    When ``elm`` contains no underscore the whole string is parsed.
    Raises ``ValueError`` if that trailing part is not a valid integer.
    """
    _, _, suffix = elm.rpartition("_")
    return int(suffix)
#z.apply(select)
# Parse the integer label out of column 1 of every frame (see ``select``) and
# keep it on the frame as a "label" column.
for split in ASR:
    ASR[split]["label"] = ASR[split][1].apply(select)
    TRS[split]["label"] = TRS[split][1].apply(select)

# Binarizer fitted on the distinct labels seen in the TRS training split only.
lb = preprocessing.LabelBinarizer(neg_label=0)
lb.fit(list(set(TRS["TRAIN"]["label"])))

# Build the whole mapping first and store it with a single assignment.
# Mutating ``db["LABEL"]`` in place, as before, only persists because the
# shelf is opened with writeback=True; storing the finished dict does not rely
# on that and matches how the other entries are written.
# Only the TRS labels are binarized and stored; the ASR "label" columns stay
# on the frames.
labels = {}
for split in ASR:
    labels[split] = lb.transform(TRS[split]["label"])
db["LABEL"] = labels

db.sync()
db.close()