# 01a-mlp_proj.py
# coding: utf-8

# In[2]:

# Import
import itertools
import json
import os
import pickle
import shelve
import sys

import gensim
import pandas
import sklearn.metrics
from keras.callbacks import ModelCheckpoint
from keras.layers.advanced_activations import ELU, PReLU
from keras.models import Sequential
from keras.optimizers import SGD, Adam
from scipy import sparse
from sklearn import preprocessing
from sklearn.preprocessing import LabelBinarizer

from mlp import *
from utils import *
# In[4]:

# Command-line interface:
#   argv[1] -> base directory where per-experiment outputs are written
#   argv[2] -> shelve file holding the features and labels to project
#   argv[3] -> JSON experiment configuration
#   argv[4] -> (optional) key of the feature set to project; defaults to "LDA"
infer_model = shelve.open("{}".format(sys.argv[2]))
in_dir = sys.argv[1]
#['ASR', 'TRS', 'LABEL']

if len(sys.argv) > 4:
    features_key = sys.argv[4]
else:
    features_key = "LDA"
save_projection = True
# Use a context manager so the config file handle is closed
# (the original json.load(open(...)) leaked it).
with open(sys.argv[3]) as conf_file:
    json_conf = json.load(conf_file)
ae_conf = json_conf["mlp_proj"]

hidden_size= ae_conf["hidden_size"]
input_activation = None
if ae_conf["input_activation"] == "elu":
    print " ELU"
    input_activation = PReLU()
else:
    print " ELSE"
    input_activation = ae_conf["input_activation"]
#input_activation=ae_conf["input_activation"]
output_activation=ae_conf["output_activation"]
loss=ae_conf["loss"]
epochs=ae_conf["epochs"]
batch_size=ae_conf["batch"]
patience=ae_conf["patience"]
dropouts=ae_conf["do"]
try:
    k = ae_conf["sgd"]
    if ae_conf["sgd"]["name"] == "adam":
        sgd = Adam(lr=ae_conf["sgd"]["lr"])#SGD(lr=0.00001,nesterov=False) #'rmsprop'# Adam(lr=0.00001)#SGD(lr=0.001, momentum=0.9, nesterov=True)
    elif ae_conf["sgd"]["name"] == "sgd":
        sgd = SGD(lr=ae_conf["sgd"]["lr"])
except: 
    sgd = ae_conf["sgd"]

# Classifier ("mlp") hyper-parameters.  NOTE(review): these mlp_* values are
# not used anywhere in this script's visible code -- presumably consumed by a
# companion script sharing this config; verify before removing.
mlp_conf = json_conf["mlp"]
mlp_h = mlp_conf["hidden_size"]
mlp_loss = mlp_conf["loss"]
mlp_dropouts = mlp_conf["do"]
mlp_epochs = mlp_conf["epochs"]
mlp_batch_size = mlp_conf["batch"]
mlp_input_activation = mlp_conf["input_activation"]
mlp_output_activation = mlp_conf["output_activation"]

# "sgd" is either a dict ({"name": ..., "lr": ...}) or a plain optimizer-name
# string; only a TypeError (string indexed with "name") selects the fallback.
try:
    mlp_sgd_conf = mlp_conf["sgd"]
    if mlp_sgd_conf["name"] == "adam":
        mlp_sgd = Adam(lr=mlp_sgd_conf["lr"])
    elif mlp_sgd_conf["name"] == "sgd":
        mlp_sgd = SGD(lr=mlp_sgd_conf["lr"])
    else:
        # Unknown name: hand the raw config to Keras instead of leaving
        # `mlp_sgd` undefined (NameError in the original).
        mlp_sgd = mlp_sgd_conf
except TypeError:
    mlp_sgd = mlp_conf["sgd"]


# Prepare the per-experiment output directory and the labels shelve.
name = json_conf["name"]
out_dir = "{}/{}".format(in_dir, name)
# Best-effort mkdir: an already-existing directory raises OSError,
# which is deliberately ignored.
try:
    os.mkdir(out_dir)
except OSError:
    pass
# Store the raw label ids alongside the projections.
db = shelve.open("{}/labels.shelve".format(out_dir))
db["IDS"] = dict(infer_model["LABEL"])
#
keys = infer_model[features_key].keys()
LABELS = {}
for mod in keys : 

    int_labels_train = map(select,infer_model["LABEL"][mod]["TRAIN"])
    binarizer = LabelBinarizer()
    y_train=binarizer.fit_transform(int_labels_train)
    y_dev=binarizer.transform(map(select,infer_model["LABEL"][mod]["DEV"]))
    y_test=binarizer.transform(map(select,infer_model["LABEL"][mod]["TEST"]))
    LABELS[mod]= { "TRAIN":y_train , "DEV" : y_dev, "TEST" : y_test}
    sumary,proj = train_mlp_proj(infer_model[features_key][mod]["TRAIN"].todense(),y_train,
                            infer_model[features_key][mod]["DEV"].todense(),y_dev,
                            infer_model[features_key][mod]["TEST"].todense(),y_test,
                            hidden_size ,sgd=sgd,
                            epochs=epochs,
                            patience=patience,
                            batch_size=batch_size,
                            input_activation=input_activation,
                            output_activation=output_activation,
                            dropouts=dropouts,
                            fit_verbose=1)
    with  open("{}/{}/{}_sum.txt".format(in_dir,name,mod),"w") as output_sum :
        print >>output_sum, sumary
    for num_lvl,level in enumerate(proj):
        print len(level)
        for num,corp_type in enumerate(["TRAIN","DEV","TEST"]):
            pd = pandas.DataFrame(level[num])
            pd.to_hdf("{}/{}/MLP_proj_df.hdf".format(in_dir,name),"{}/lvl{}/{}".format(mod,num_lvl,corp_type))
db["LABEL"] = LABELS
db.sync()
db.close()