Blame view

KNN_replace.py 3.6 KB
b6d0165d1   Killian   Initial commit
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
  
  # coding: utf-8
  
  # In[29]:
  
  # Import
  import itertools
  import shelve
  import pickle
  import pandas
  import numpy
  import nltk
  import codecs
  import gensim
  import scipy
  from scipy import sparse
  import scipy.sparse
  import scipy.io
  import sklearn
  from sklearn.feature_extraction.text import CountVectorizer
  import sklearn.metrics
  from sklearn.neighbors import NearestNeighbors
  from sklearn.metrics import confusion_matrix
  from sklearn import preprocessing
  from keras.models import Sequential
  from keras.layers.core import Dense, Dropout, Activation,AutoEncoder
  from keras.optimizers import SGD
  from keras.layers import containers
  from mlp import *
  import mlp
  
  
  
  
  
  # In[2]:
  
  
  
  # In[3]:
  
  
  
  # In[4]:
  
#db=shelve.open("SPELIKE_MLP_DB.shelve",writeback=True)
# Autoencoder ("SPE-like") document representations for the DECODA corpus.
# Expected keys (per the note below): 'TRS_SPELIKE_H2_RELU',
# 'ASR_SPELIKE_OUT_RELU', 'ASR_SPELIKE_H2_RELU' -- each presumably a dict
# with 'TRAIN'/'DEV'/'TEST' splits (TODO confirm against the shelve file).
corps=shelve.open("DECODA_SPELIKE.shelve")
# [ TRS_SPELIKE_H2_RELU', 'ASR_SPELIKE_OUT_RELU', 'ASR_SPELIKE_H2_RELU' ]
# Sparse bag-of-words matrices and topic labels for the same corpus.
sparse_corp=shelve.open("DECODA_sparse.shelve")
# [ 'vocab', 'LABEL', 'TRS_SPARSE', 'ASR_SPARSE'] 

# Results shelve; writeback=True so nested assignments like
# scores["ASR_RAW"]=... are persisted when the shelve is synced/closed.
scores=shelve.open("scores/KNN_SPE.shelve",writeback=True)

# In[8]:

# ASR = automatic transcriptions, TRS = manual transcriptions; each maps
# split name ('TRAIN'/'DEV'/'TEST') to a scipy sparse matrix (rows = docs).
ASR_sparse=sparse_corp["ASR_SPARSE"]
TRS_sparse=sparse_corp["TRS_SPARSE"]
  
  
  # In[9]:
  
  def most_common(lst):
      return max(set(lst), key=lst.count)
  
  
  
  # In[2]:
  
# In[6]:

print "nbrs"
# Fit a 1-nearest-neighbour index on the manual-transcription (TRS)
# bag-of-words TRAIN matrix.  Each ASR document below is replaced by its
# closest TRS training document before classification.
nbrs = NearestNeighbors(n_neighbors=1, algorithm='auto').fit(TRS_sparse["TRAIN"])
# RAW ASR
print "raw asr"
# kneighbors returns (distances, indices), each of shape (n_docs, 1).
TEST_distances, TEST_indices = nbrs.kneighbors(ASR_sparse["TEST"])
DEV_distances, DEV_indices = nbrs.kneighbors(ASR_sparse["DEV"])

# Each x is a length-1 index array, so TRS_sparse["TRAIN"][x] is a 1-row
# sparse matrix and .toarray()[0] extracts that row as a dense vector.
# The rebuilt DEV/TEST sets therefore contain only TRS training rows.
x_test=scipy.sparse.csr.csr_matrix([TRS_sparse["TRAIN"][x].toarray()[0] for x in TEST_indices])
x_dev=scipy.sparse.csr.csr_matrix([TRS_sparse["TRAIN"][x].toarray()[0] for x in DEV_indices])

# Train an MLP (project-local mlp.train_mlp) on clean TRS data and score
# it on the NN-replaced DEV/TEST sets; result stored under "ASR_RAW".
scores["ASR_RAW"]=mlp.train_mlp(TRS_sparse["TRAIN"].todense(),sparse_corp["LABEL"]["TRAIN"],x_dev.todense(),sparse_corp["LABEL"]["DEV"],x_test.todense(),sparse_corp["LABEL"]["TEST"],[1024,512,1024],dropouts=[0.5,0.25,0],sgd="adam",epochs=150)
  
print "asr ae out"
# AE OUT
# Same replacement scheme, but the queries are the autoencoder's output
# reconstructions of the ASR documents ("ASR_SPELIKE_OUT_RELU").  Reuses
# the `nbrs` index fitted on the raw TRS bag-of-words TRAIN matrix, so
# the AE output is assumed to live in the same vocabulary space -- TODO
# confirm dimensions match.
TEST_distances, TEST_indices = nbrs.kneighbors(corps["ASR_SPELIKE_OUT_RELU"]["TEST"])
DEV_distances, DEV_indices = nbrs.kneighbors(corps["ASR_SPELIKE_OUT_RELU"]["DEV"])
# Rebuild DEV/TEST from the matched TRS training rows (see raw-ASR block).
x_test=scipy.sparse.csr.csr_matrix([TRS_sparse["TRAIN"][x].toarray()[0] for x in TEST_indices])
x_dev=scipy.sparse.csr.csr_matrix([TRS_sparse["TRAIN"][x].toarray()[0] for x in DEV_indices])


scores["ASR_AE_OUT"]=mlp.train_mlp(TRS_sparse["TRAIN"].todense(),sparse_corp["LABEL"]["TRAIN"],x_dev.todense(),sparse_corp["LABEL"]["DEV"],x_test.todense(),sparse_corp["LABEL"]["TEST"],[1024,512,1024],dropouts=[0.5,0.25,0],sgd="adam",epochs=150)
  
# AE H2 MLP OUT

print "asr h2 MLP OUT"

# Re-fit the 1-NN index in the autoencoder's hidden (H2) space: neighbours
# are now found between ASR-H2 queries and TRS-H2 training vectors.
nbrs = NearestNeighbors(n_neighbors=1, algorithm='auto').fit(corps["TRS_SPELIKE_H2_RELU"]["TRAIN"])


TEST_distances, TEST_indices = nbrs.kneighbors(corps["ASR_SPELIKE_H2_RELU"]["TEST"])
DEV_distances, DEV_indices = nbrs.kneighbors(corps["ASR_SPELIKE_H2_RELU"]["DEV"])
# NOTE: neighbours are matched in H2 space, but the replacement rows are
# taken from the *bag-of-words* TRS TRAIN matrix (dense numpy here, not
# csr as in the earlier blocks -- train_mlp apparently accepts both).
x_test=numpy.array([TRS_sparse["TRAIN"][x].toarray()[0] for x in TEST_indices])
x_dev=numpy.array([TRS_sparse["TRAIN"][x].toarray()[0] for x in DEV_indices])

scores["ASR_AE_H2_MLP_OUT"]=mlp.train_mlp(TRS_sparse["TRAIN"].todense(),sparse_corp["LABEL"]["TRAIN"],x_dev,sparse_corp["LABEL"]["DEV"],x_test,sparse_corp["LABEL"]["TEST"],[1024,512,1024],dropouts=[0.5,0.25,0],sgd="adam",epochs=150)
  
# AE HZ MLP H2 
print "asr h2 MLP h2"

# Reuses TEST_indices/DEV_indices computed by the previous (H2-space)
# kneighbors calls -- this block must run after it.  Here both the
# replacement vectors and the MLP training data are H2 representations.
# TRAIN[x] with a length-1 index array yields shape (1, dim); [0] takes
# the row -- assumes TRAIN is a numpy array (TODO confirm).
x_test=numpy.array([corps["TRS_SPELIKE_H2_RELU"]["TRAIN"][x][0]  for x in TEST_indices])
x_dev=numpy.array([corps["TRS_SPELIKE_H2_RELU"]["TRAIN"][x][0] for x in DEV_indices])

scores["ASR_AE_H2_MLP_H2"]=mlp.train_mlp(corps["TRS_SPELIKE_H2_RELU"]["TRAIN"],sparse_corp["LABEL"]["TRAIN"],x_dev,sparse_corp["LABEL"]["DEV"],x_test,sparse_corp["LABEL"]["TEST"],[1024,512,1024],dropouts=[0.5,0.25,0],sgd="adam",epochs=150)