DECODA_make_sparse_label.py
  
# coding: utf-8

# In[2]:

# Imports: pandas reads the tab-separated corpora, nltk tokenises, gensim
# builds the vocabulary, scikit-learn vectorises the text and binarises the
# labels, and shelve persists the results.
import pandas
import nltk
import gensim
import shelve
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing
  
  # In[4]:
  
db = shelve.open("DECODA_sparse.shelve", writeback=True)
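# NB: with writeback=True, accessed entries are cached in memory and only
# written to disk on db.sync() / db.close() at the end of the script.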
  
  # In[6]:
  
ASR = {}
TRS = {}
# Each .srl file is a headerless tab-separated table; column 1 carries the
# raw label and column 2 the transcription (automatic for ASR, manual for TRS).
for split in ("TRAIN", "DEV", "TEST"):
    ASR[split] = pandas.read_csv("./ASR/corpus_%s_ASR.srl" % split, sep="\t",
                                 header=None, na_values=None, keep_default_na=False)
    TRS[split] = pandas.read_csv("./TRS/corpus_%s_TRS.srl" % split, sep="\t",
                                 header=None, na_values=None, keep_default_na=False)
  
  # In[7]:
  
tok2 = nltk.RegexpTokenizer(r'''(?x)
          \d+(?:\.\d+)?\s*%   # percentages
        | \w'                 # elided articles: d', l', ...
        | \w+                 # plain words
        | [^\w\s]             # punctuation
        ''')
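
# Illustrative only: on a sample utterance the tokenizer yields
#   tok2.tokenize("c'est 20 % d'accord .")
#   -> ["c'", 'est', '20 %', "d'", 'accord', '.']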
  
def yield_corpus(df_list):
    """Yield one token list per document, reading the text from column 2."""
    for corpus in df_list:
        for doc_id, doc in corpus.iterrows():
            try:
                yield tok2.tokenize(doc[2])
            except Exception:
                print(doc[2])
                raise
  
  
  # In[8]:
  
# Build the vocabulary over the union of the ASR and manual (TRS) training
# transcriptions, and persist it.
vocab = gensim.corpora.dictionary.Dictionary(documents=yield_corpus([ASR["TRAIN"], TRS["TRAIN"]]))
db["vocab"] = vocab
  
  # In[9]:
  
# Binary bag-of-words vectorizer over the fixed training vocabulary; with an
# explicit vocabulary, document-frequency pruning (min_df) does not apply.
dico = CountVectorizer(binary=True, vocabulary=list(vocab.values()), tokenizer=tok2.tokenize)
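
# Sanity check (illustrative): transform() returns a scipy.sparse matrix with
# one row per document and one column per vocabulary entry, e.g.
#   dico.transform(["bonjour madame"]).shape  # -> (1, len(vocab))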
  
  
# In[10]:
  
# Vectorise every split of both corpora into binary sparse document-term
# matrices and persist them.
ASR_sparse = {}
TRS_sparse = {}
for i in ASR.keys():
    ASR_sparse[i] = dico.transform(ASR[i][2])
    TRS_sparse[i] = dico.transform(TRS[i][2])

db["ASR_SPARSE"] = ASR_sparse
db["TRS_SPARSE"] = TRS_sparse

  # In[11]:
  
def select(elm):
    """Return the numeric class id from a raw label of the form 'prefix_N'."""
    return int(elm.split("_")[-1])
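
# Illustrative only -- assuming raw labels of the form "<prefix>_<class id>",
# e.g. select("theme_3") -> 3.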
# Extract the integer label for every split, then one-hot encode it.
for i in ASR.keys():
    ASR[i]["label"] = ASR[i][1].apply(select)
    TRS[i]["label"] = TRS[i][1].apply(select)
lb = preprocessing.LabelBinarizer(neg_label=0)
lb.fit(list(set(TRS["TRAIN"]["label"])))
db["LABEL"] = {}
for i in ASR.keys():
    # The manual (TRS) labels serve as the reference for both corpora.
    db["LABEL"][i] = lb.transform(TRS[i]["label"])
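
# Illustrative only: with classes fitted as e.g. [1, 2, 3],
#   lb.transform([2])  # -> array([[0, 1, 0]])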
  
  db.sync()
  db.close()
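
# To reload the preprocessed data later (illustrative):
#   db = shelve.open("DECODA_sparse.shelve")
#   X_train, y_train = db["ASR_SPARSE"]["TRAIN"], db["LABEL"]["TRAIN"]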