# 04a-mlp.py
# (header reconstructed: the original paste carried a code-viewer line-number
#  gutter here, which is not part of the program)
# coding: utf-8
# In[29]:
# Import
import itertools
import shelve
import pickle
import numpy
import scipy
from scipy import sparse
import scipy.sparse
import scipy.io
from mlp import *
import mlp
import sys
import utils
import dill
from collections import Counter
from gensim.models import LdaModel
# In[3]:
#30_50_50_150_0.0001
# In[4]:
#db=shelve.open("SPELIKE_MLP_DB.shelve",writeback=True)
# Command-line contract:
#   argv[1] = working directory holding the models and receiving the outputs
#   argv[2] = path to the source corpus shelve (keys include 'LABEL',
#             'ASR_wid', 'TRS_wid' -- see the key listings above)
in_dir = sys.argv[1]
origin_corps = shelve.open(sys.argv[2])
## ['vocab',
#'ASR_AE_OUT_RELU',
#'ASR_AE_H2_RELU',
#'ASR_H1_TRANSFORMED_W2_RELU',
#'ASR_AE_H1_RELU',
#'ASR_H1_TRANFORMED_OUT_RELU',
#'ASR_H1_TRANFORMED_TRSH2_RELU',
#'TRS_AE_H2_RELU',
#'ASR_H2_TRANSFORMED_W1_RELU',
#'ASR_H2_TRANSFORMED_W2_RELU',
#'TRS_AE_H1_RELU',
#'ASR_H2_TRANFORMED_OUT_RELU',
#'ASR_SPARSE',
#'ASR_H2_TRANFORMED_TRSH2_RELU',
#'ASR_H1_TRANSFORMED_W1_RELU',
#'TRS_AE_OUT_RELU']
##
#
# [ 'vocab', 'LABEL', 'TRS_SPARSE', 'ASR_SPARSE']
# Output shelves (writeback=True so in-place mutations of nested objects are
# flushed when the shelves are closed at the end of the script):
#   mlp_scores.shelve : per-side training results + best (dev, test) scores
#   infer.shelve      : the RAW / LDA feature dictionaries built below
out_db=shelve.open("{}/mlp_scores.shelve".format(in_dir),writeback=True)
infer_db=shelve.open("{}/infer.shelve".format(in_dir),writeback=True)
#lb=LabelBinarizer()
#y_train=lb.fit_transform([utils.select(ligneid) for ligneid in origin_corps["LABEL"]["TRAIN"]])
#y_dev=lb.transform([utils.select(ligneid) for ligneid in origin_corps["LABEL"]["DEV"]])
#y_test=lb.transform([utils.select(ligneid) for ligneid in origin_corps["LABEL"]["TEST"]])
# Labels are used exactly as stored in the corpus shelve (presumably already
# in the encoding train_mlp expects -- the LabelBinarizer path above was
# disabled; confirm upstream if labels look raw).
y_train=origin_corps["LABEL"]["TRAIN"]
y_dev= origin_corps["LABEL"]["DEV"]
y_test=origin_corps["LABEL"]["TEST"]
# Stop-word collection used to filter the bag-of-words features below.
# Fix: open the dill pickle in binary mode (pickle streams are bytes) and
# close the handle deterministically instead of leaking it.
with open("{}/stopwords.dill".format(in_dir), "rb") as sw_file:
    sw = dill.load(sw_file)  # stop words
# Pre-trained per-side LDA topic models: one trained on the automatic
# speech-recognition output (ASR), one on the manual transcriptions (TRS).
LDAs={}
LDAs["ASR"] = LdaModel.load("{}/lda_asr.model".format(in_dir))
LDAs["TRS"] = LdaModel.load("{}/lda_trs.model".format(in_dir))
# Build per-document bag-of-words features: a list of (word_id, count) pairs
# per document, with stop words dropped, for every side (ASR / TRS) and every
# split (TRAIN / DEV / TEST).  "LDA" slots are filled by the training loop.
data = {"RAW": {"ASR": {}, "TRS": {}}, "LDA": {"ASR": {}, "TRS": {}}}
for side in ("ASR", "TRS"):
    for split in ("TRAIN", "DEV", "TEST"):
        docs = origin_corps["{}_wid".format(side)][split]
        data["RAW"][side][split] = [
            [(wid, cnt) for wid, cnt in Counter(doc).items() if wid not in sw]
            for doc in docs
        ]
nb_epochs=500
# For each side (manual transcriptions first, then ASR output): project the
# bag-of-words features into LDA topic space, train an MLP classifier on the
# topic features, and keep the epoch with the best DEV score.
# NOTE: indentation below is reconstructed -- inference over all splits must
# complete before train_mlp consumes TRAIN/DEV/TEST.
for key in ["TRS", "ASR"]:
    # LdaModel.inference returns (gamma, sstats); keep only gamma, the
    # document-topic distribution matrix.
    for corp_key in data["RAW"][key].keys():
        data["LDA"][key][corp_key] = \
            LDAs[key].inference(
                data["RAW"][key][corp_key])[0]
    res = mlp.train_mlp(
        data["LDA"][key]["TRAIN"], y_train,
        data["LDA"][key]["DEV"], y_dev,
        data["LDA"][key]["TEST"], y_test,
        [40, 25, 40], dropouts=[0, 0, 0, 0], sgd=Adam(lr=0.0001),
        epochs=nb_epochs, batch_size=8, save_pred=False, keep_histo=False,
        loss="categorical_crossentropy", fit_verbose=0)
    # res[1] / res[2] are assumed to be per-epoch DEV / TEST scores -- TODO
    # confirm against mlp.train_mlp.  Model selection: best DEV epoch, report
    # the TEST score of that same epoch.
    arg_best = numpy.argmax(res[1])
    dev_best = res[1][arg_best]
    test_best = res[2][arg_best]
    out_db[key] = (res, (dev_best, test_best))
    # Single-argument print() emits identical bytes under Python 2 and 3
    # (the original `print a,b,c` statement was Python-2-only).
    print("{} {} {}".format(in_dir, dev_best, test_best))
# Persist the RAW/LDA feature dictionaries for later reuse, print a summary
# of the best (dev, test) score pair per side, then close every shelve so
# the writeback caches are flushed to disk.
# (Single-argument print() replaces the Python-2-only `print a,b` statement;
# the emitted text is unchanged.)
for k, v in data.items():
    infer_db[k] = v
for key in out_db.keys():
    print("{} {}".format(key, out_db[key][1]))
out_db.close()
infer_db.close()
origin_corps.close()