KNN_replace.py
# coding: utf-8
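"""KNN-replacement experiment on the DECODA corpus.

For each ASR (automatic transcription) document, a 1-nearest-neighbour
search finds the closest TRS (manual transcription) training document;
the ASR representation is replaced by that neighbour's and the result is
scored with an MLP classifier (mlp.train_mlp). Four variants are run:
raw sparse bag-of-words (ASR_RAW), autoencoder output-layer codes
(ASR_AE_OUT), and two hidden-layer H2 setups (ASR_AE_H2_MLP_OUT,
ASR_AE_H2_MLP_H2).
"""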
# Imports
import shelve
from sklearn.neighbors import NearestNeighbors
import mlp
#db=shelve.open("SPELIKE_MLP_DB.shelve",writeback=True)
corps = shelve.open("DECODA_SPELIKE.shelve")
# keys: ['TRS_SPELIKE_H2_RELU', 'ASR_SPELIKE_OUT_RELU', 'ASR_SPELIKE_H2_RELU']
sparse_corp = shelve.open("DECODA_sparse.shelve")
# keys: ['vocab', 'LABEL', 'TRS_SPARSE', 'ASR_SPARSE']
scores = shelve.open("scores/KNN_SPE.shelve", writeback=True)
ASR_sparse=sparse_corp["ASR_SPARSE"]
TRS_sparse=sparse_corp["TRS_SPARSE"]
# Majority element of a list (helper; not used below).
def most_common(lst):
    return max(set(lst), key=lst.count)
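# e.g. most_common(["a", "b", "a"]) -> "a"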
print "nbrs"
nbrs = NearestNeighbors(n_neighbors=1, algorithm='auto').fit(TRS_sparse["TRAIN"])
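# NOTE: mlp.train_mlp comes from the local `mlp` module (not shown here); from
# the calls below it is assumed to take (x_train, y_train, x_dev, y_dev,
# x_test, y_test, hidden_layer_sizes, dropouts=..., sgd=..., epochs=...) and
# to return the evaluation results stored in the `scores` shelve.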
# RAW ASR: replace each ASR document by its nearest TRS training document.
print("raw asr")
TEST_distances, TEST_indices = nbrs.kneighbors(ASR_sparse["TEST"])
DEV_distances, DEV_indices = nbrs.kneighbors(ASR_sparse["DEV"])
# kneighbors returns an (n, 1) index array; column 0 holds the single neighbour.
x_test = TRS_sparse["TRAIN"][TEST_indices[:, 0]]
x_dev = TRS_sparse["TRAIN"][DEV_indices[:, 0]]
scores["ASR_RAW"] = mlp.train_mlp(
    TRS_sparse["TRAIN"].todense(), sparse_corp["LABEL"]["TRAIN"],
    x_dev.todense(), sparse_corp["LABEL"]["DEV"],
    x_test.todense(), sparse_corp["LABEL"]["TEST"],
    [1024, 512, 1024], dropouts=[0.5, 0.25, 0], sgd="adam", epochs=150)
print "asr ae out"
# AE OUT
TEST_distances, TEST_indices = nbrs.kneighbors(corps["ASR_SPELIKE_OUT_RELU"]["TEST"])
DEV_distances, DEV_indices = nbrs.kneighbors(corps["ASR_SPELIKE_OUT_RELU"]["DEV"])
x_test=scipy.sparse.csr.csr_matrix([TRS_sparse["TRAIN"][x].toarray()[0] for x in TEST_indices])
x_dev=scipy.sparse.csr.csr_matrix([TRS_sparse["TRAIN"][x].toarray()[0] for x in DEV_indices])
scores["ASR_AE_OUT"]=mlp.train_mlp(TRS_sparse["TRAIN"].todense(),sparse_corp["LABEL"]["TRAIN"],x_dev.todense(),sparse_corp["LABEL"]["DEV"],x_test.todense(),sparse_corp["LABEL"]["TEST"],[1024,512,1024],dropouts=[0.5,0.25,0],sgd="adam",epochs=150)
# AE H2 MLP OUT: refit the 1-NN index on the TRS hidden-layer (H2) codes and
# match ASR H2 codes against it, but feed the MLP the matched sparse TRS rows.
print("asr h2 MLP OUT")
nbrs = NearestNeighbors(n_neighbors=1, algorithm='auto').fit(corps["TRS_SPELIKE_H2_RELU"]["TRAIN"])
TEST_distances, TEST_indices = nbrs.kneighbors(corps["ASR_SPELIKE_H2_RELU"]["TEST"])
DEV_distances, DEV_indices = nbrs.kneighbors(corps["ASR_SPELIKE_H2_RELU"]["DEV"])
x_test = TRS_sparse["TRAIN"][TEST_indices[:, 0]].toarray()
x_dev = TRS_sparse["TRAIN"][DEV_indices[:, 0]].toarray()
scores["ASR_AE_H2_MLP_OUT"] = mlp.train_mlp(
    TRS_sparse["TRAIN"].todense(), sparse_corp["LABEL"]["TRAIN"],
    x_dev, sparse_corp["LABEL"]["DEV"],
    x_test, sparse_corp["LABEL"]["TEST"],
    [1024, 512, 1024], dropouts=[0.5, 0.25, 0], sgd="adam", epochs=150)
# AE H2 MLP H2: reuse the H2 neighbour indices from above, but this time
# select the TRS H2 vectors themselves and train the MLP entirely in H2 space.
print("asr h2 MLP h2")
x_test = corps["TRS_SPELIKE_H2_RELU"]["TRAIN"][TEST_indices[:, 0]]
x_dev = corps["TRS_SPELIKE_H2_RELU"]["TRAIN"][DEV_indices[:, 0]]
scores["ASR_AE_H2_MLP_H2"] = mlp.train_mlp(
    corps["TRS_SPELIKE_H2_RELU"]["TRAIN"], sparse_corp["LABEL"]["TRAIN"],
    x_dev, sparse_corp["LABEL"]["DEV"],
    x_test, sparse_corp["LABEL"]["TEST"],
    [1024, 512, 1024], dropouts=[0.5, 0.25, 0], sgd="adam", epochs=150)