Blame view

LDA/00-prepross.py 2.17 KB
b6d0165d1   Killian   Initial commit
  
# coding: utf-8

# In[2]:

# Imports: pandas for reading the corpora, gensim for the vocabulary,
# scikit-learn/scipy for label binarisation and sparse matrices,
# shelve/pickle for persistence.
import pandas
# Alignment
from alignment.sequence import Sequence
from alignment.vocabulary import Vocabulary
from alignment.sequencealigner import *
import nltk
import codecs
import gensim
from scipy import sparse
import itertools
from sklearn.feature_extraction.text import CountVectorizer
import scipy.sparse
import scipy.io
from sklearn import preprocessing
import shelve
import pickle
from utils import *
  
# In[4]:

# Persistent store for the preprocessed data (vocabulary, word-id lists, labels).
db = shelve.open("DECODA_list_wid.shelve", writeback=True)
  
# In[6]:

# Load the ASR outputs and the manual transcriptions (TRS) for each split.
ASR = {}
TRS = {}
ASR["TRAIN"] = pandas.read_table("../ASR/corpus_TRAIN_ASR.srl", sep="\t", header=None, na_values=None, keep_default_na=False)
ASR["DEV"] = pandas.read_table("../ASR/corpus_DEV_ASR.srl", sep="\t", header=None, na_values=None, keep_default_na=False)
ASR["TEST"] = pandas.read_table("../ASR/corpus_TEST_ASR.srl", sep="\t", header=None, na_values=None, keep_default_na=False)

print(ASR["TEST"].iterrows())  # quick sanity check that the table loaded
TRS["TRAIN"] = pandas.read_table("../TRS/corpus_TRAIN_TRS.srl", sep="\t", header=None, na_values=None, keep_default_na=False)
TRS["DEV"] = pandas.read_table("../TRS/corpus_DEV_TRS.srl", sep="\t", header=None, na_values=None, keep_default_na=False)
TRS["TEST"] = pandas.read_table("../TRS/corpus_TEST_TRS.srl", sep="\t", header=None, na_values=None, keep_default_na=False)
  
# In[7]:

def doprint(x):
    # Debug helper, currently disabled: print(x.encode("utf8"))
    pass

# Flatten the TRAIN split of both ASR and TRS into one list of token lists.
all_corp = [x for x in yield_corpus([ASR["TRAIN"]] + [TRS["TRAIN"]])]
  
# In[8]:

# Build the word <-> id mapping over the combined training corpus and store it.
vocab = gensim.corpora.dictionary.Dictionary(all_corp)
db["vocab"] = vocab
  
# In[10]:
  
# Map each document to the list of vocabulary ids of its in-vocabulary tokens.
ASR_wid = {}
TRS_wid = {}
for i in ASR.keys():
    ASR_wid[i] = [[vocab.token2id[y] for y in x if y in vocab.token2id] for x in yield_corpus([ASR[i]])]
    TRS_wid[i] = [[vocab.token2id[y] for y in x if y in vocab.token2id] for x in yield_corpus([TRS[i]])]

db["ASR_wid"] = ASR_wid
db["TRS_wid"] = TRS_wid

# In[11]:
  
def select(elm):
    # The class label is the last "_"-separated field of the identifier column.
    return int(elm.split("_")[-1])

for i in ASR.keys():
    ASR[i]["label"] = ASR[i][1].apply(select)
    TRS[i]["label"] = TRS[i][1].apply(select)

# One-hot encode the labels, fitted on the TRAIN transcription labels.
lb = preprocessing.LabelBinarizer(neg_label=0)
lb.fit(list(set(TRS["TRAIN"]["label"])))
db["LABEL"] = {}
for i in ASR.keys():
    db["LABEL"][i] = lb.transform(TRS[i]["label"])

db.sync()
db.close()
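
# A minimal usage sketch, not part of the original pipeline: how a downstream
# step could read this shelve back. Key names are assumed from the writes above.
#
#   db = shelve.open("DECODA_list_wid.shelve")
#   vocab = db["vocab"]                  # gensim Dictionary (token <-> id)
#   train_docs = db["ASR_wid"]["TRAIN"]  # documents as lists of word ids
#   train_y = db["LABEL"]["TRAIN"]       # one-hot label matrix
#   db.close()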