# 04f-pca.py (2.38 KB)
# coding: utf-8

# In[29]:

# Import
import itertools
import shelve
import pickle
import numpy
import scipy
from scipy import sparse
import scipy.sparse
import scipy.io
from mlp import *
import mlp
import sys
import utils
import dill
from collections import Counter
from gensim.models import LdaModel
from sklearn.decomposition import PCA



# In[3]:

#30_50_50_150_0.0001

# In[4]:

# Disabled, kept for reference: db=shelve.open("SPELIKE_MLP_DB.shelve",writeback=True)
# PCA dimensionality sweep.
#
# Usage: 04f-pca.py IN_DIR CORPUS_SHELVE [FEATURES_KEY]
#
# For every sub-corpus stored under FEATURES_KEY in CORPUS_SHELVE, a whitened
# PCA is fitted on the TRAIN split and applied to DEV and TEST.  mlp.train_mlp
# is then called once per prefix of the leading components, and the collected
# scores are written to IN_DIR/pca_scores.shelve.
if len(sys.argv) < 3:
    sys.exit("usage: {} IN_DIR CORPUS_SHELVE [FEATURES_KEY]".format(sys.argv[0]))

in_dir = sys.argv[1]
corpus_path = sys.argv[2]
# Name of the feature set to read from the corpus shelf; defaults to "LDA".
features_key = sys.argv[3] if len(sys.argv) > 3 else "LDA"

# Number of principal components fitted for each sub-corpus.
pca_components = 200

# Settings forwarded to mlp.train_mlp.
mlp_h = [250, 250]
mlp_loss = "categorical_crossentropy"
mlp_dropouts = [0.25] * len(mlp_h)
mlp_sgd = Adam(lr=0.0001)
mlp_epochs = 3000
mlp_batch_size = 5
# NOTE(review): these two activations are defined but never passed to
# train_mlp in this script -- confirm whether they should be.
mlp_input_activation = "relu"
mlp_output_activation = "softmax"

# One (key, dev_best, test_best, test_max) tuple per sub-corpus.
ress = []
print("")

origin_corps = shelve.open(corpus_path)
try:
    out_db = shelve.open("{}/pca_scores.shelve".format(in_dir), writeback=True)
    try:
        # Every shelf lookup unpickles the whole stored value, so the two
        # top-level entries are fetched once instead of on each use below.
        features = origin_corps[features_key]
        labels = origin_corps["LABEL"]

        for key in features.keys():
            print("#########" + key + "########")
            dev_best = []
            test_best = []
            test_max = []

            # Fit on TRAIN only; DEV and TEST reuse the same projection.
            pca = PCA(n_components=pca_components, copy=True, whiten=True)
            x_train_big = pca.fit_transform(features[key]["TRAIN"])
            y_train = labels[key]["TRAIN"]

            x_dev_big = pca.transform(features[key]["DEV"])
            y_dev = labels[key]["DEV"]

            x_test_big = pca.transform(features[key]["TEST"])
            y_test = labels[key]["TEST"]

            # NOTE(review): the sweep stops at pca_components - 1, so the full
            # pca_components-dimensional input is never evaluated -- confirm
            # that this is intended.
            for i in range(1, pca_components):
                # Keep only the first i principal components.
                x_train = x_train_big[:, :i]
                x_dev = x_dev_big[:, :i]
                x_test = x_test_big[:, :i]
                print("xshape %s" % (x_train.shape,))
                print("xdev %s" % (x_dev.shape,))
                print("xtest %s" % (x_test.shape,))

                res = mlp.train_mlp(x_train, y_train,
                                    x_dev, y_dev,
                                    x_test, y_test,
                                    mlp_h, dropouts=mlp_dropouts, sgd=mlp_sgd,
                                    epochs=mlp_epochs,
                                    batch_size=mlp_batch_size,
                                    save_pred=False, keep_histo=False,
                                    loss=mlp_loss, fit_verbose=0)

                # res[1] / res[2] are presumably the DEV / TEST score series
                # (inferred from the variable names; confirm against
                # mlp.train_mlp).  The TEST score is read at the index where
                # the DEV series peaks.
                arg_best = numpy.argmax(res[1])
                dev_best.append(res[1][arg_best])
                test_best.append(res[2][arg_best])
                test_max.append(numpy.max(res[2]))
                print("%s %s" % (dev_best[-1], test_best[-1]))

            # `res` is the result of the last sweep step only; the three lists
            # hold one value per component count.
            out_db[key] = (res, (dev_best, test_best, test_max))
            ress.append((key, dev_best, test_best, test_max))
            # Flush after each sub-corpus so finished results reach disk.
            out_db.sync()

        for el in ress:
            print(el)
    finally:
        out_db.close()
finally:
    origin_corps.close()