00-prepross.py
2.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# coding: utf-8
# In[2]:
# Import
import pandas
# Alignement
from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import *
import nltk
import codecs
import gensim
from scipy import sparse
import itertools
from sklearn.feature_extraction.text import CountVectorizer
import scipy.sparse
import scipy.io
from sklearn import preprocessing
import shelve
import pickle
from utils import *
# In[4]:
# Persistent store for everything this script produces. writeback=True makes
# the shelf cache accessed entries and write them back on sync()/close().
db = shelve.open("DECODA_list_wid.shelve", writeback=True)
# In[6]:
# Corpus tables keyed by split name: ASR holds the files under ../ASR,
# TRS the files under ../TRS.
ASR = {}
TRS = {}
for split in ("TRAIN", "DEV", "TEST"):
    # Tab-separated, no header row. keep_default_na=False with no na_values
    # means no cell is converted to NaN (tokens such as "NA" stay strings).
    ASR[split] = pandas.read_table(
        "../ASR/corpus_%s_ASR.srl" % split,
        sep="\t", header=None, na_values=None, keep_default_na=False)
# Debug output: this prints the generator object returned by iterrows(), not
# the rows themselves. Parenthesized so the line is valid on Python 2 and 3.
print(ASR["TEST"].iterrows())
for split in ("TRAIN", "DEV", "TEST"):
    TRS[split] = pandas.read_table(
        "../TRS/corpus_%s_TRS.srl" % split,
        sep="\t", header=None, na_values=None, keep_default_na=False)
# In[7]:
def doprint(x):
    """Debug hook that currently does nothing.

    The argument is accepted and ignored, so calls to this function can stay
    in place without producing output.

    :param x: value that would be printed; unused.
    :return: None
    """
    return None
# Materialise everything yield_corpus produces for the two TRAIN tables
# (presumably one token sequence per document -- confirm in utils).
all_corp = list(yield_corpus([ASR["TRAIN"], TRS["TRAIN"]]))
# In[8]:
# The dictionary is built from the TRAIN tables only, then stored.
vocab = gensim.corpora.dictionary.Dictionary(all_corp)
db["vocab"] = vocab
# In[10]:


def _to_ids(table):
    """Map every document of *table* to the ids of its in-vocabulary tokens.

    Tokens that are absent from ``vocab.token2id`` are dropped.
    """
    return [
        [vocab.token2id[token] for token in doc if token in vocab.token2id]
        for doc in yield_corpus([table])
    ]


# Word-id version of each split, for both corpora.
ASR_wid = {}
TRS_wid = {}
for split in ASR.keys():
    ASR_wid[split] = _to_ids(ASR[split])
    TRS_wid[split] = _to_ids(TRS[split])
db["ASR_wid"] = ASR_wid
db["TRS_wid"] = TRS_wid
# In[11]:
def select(elm):
    """Return the integer that follows the last ``"_"`` in *elm*.

    If *elm* contains no underscore, the whole string is converted.

    :param elm: string whose final ``_``-separated field is an integer.
    :return: that final field as an ``int``.
    :raises ValueError: if the final field is not a valid integer literal.
    """
    # Only the last field is used, so a single split from the right suffices.
    return int(elm.rsplit("_", 1)[-1])
# Add an integer "label" column to every table, parsed by select() from
# column 1 of that table.
for split in ASR.keys():
    ASR[split]["label"] = ASR[split][1].apply(select)
    TRS[split]["label"] = TRS[split][1].apply(select)
# Fit the binarizer on the distinct label values of the TRS TRAIN table.
lb = preprocessing.LabelBinarizer(neg_label=0)
lb.fit(list(set(TRS["TRAIN"]['label'])))
# Build the whole mapping first and store it with one assignment. Mutating
# db["LABEL"] in place would persist only through the shelf's writeback cache.
labels = {}
for split in ASR.keys():
    # Binarized labels are always taken from the TRS table of the split.
    labels[split] = lb.transform(TRS[split]['label'])
db["LABEL"] = labels
db.sync()
db.close()