# 00-mmf_make_features.py
import sys
import os
import pandas
import numpy
import shelve
from sklearn.preprocessing import LabelBinarizer
from utils import select_mmf as select
# Build a shelve database holding binarized labels and LDA feature matrices
# for the ASR and TRS modalities.
#
# Usage: 00-mmf_make_features.py <input_dir> <level> <output_dir>
input_dir = sys.argv[1]   # top-level directory containing the ASR and TRS sub-directories
level = sys.argv[2]       # desired LDA size (original note reads "( -5)" -- TODO confirm meaning)
output_dir = sys.argv[3]  # directory in which mmf_<level>.shelve is created

lb = LabelBinarizer()

# writeback=True keeps accessed entries cached in memory, so the in-place
# mutations of the nested dicts below are persisted on sync()/close().
data = shelve.open("{}/mmf_{}.shelve".format(output_dir, level), writeback=True)
try:
    data["LABEL"] = {}
    data["LDA"] = {"ASR": {}, "TRS": {}}

    for mod in ["ASR", "TRS"]:
        train = pandas.read_table("{}/{}/train_{}.tab".format(input_dir, mod, level), sep=" ", header=None)
        dev = pandas.read_table("{}/{}/dev_{}.tab".format(input_dir, mod, level), sep=" ", header=None)
        test = pandas.read_table("{}/{}/test_{}.tab".format(input_dir, mod, level), sep=" ", header=None)

        # The first column of each table is mapped through `select` to
        # obtain the class label of every row.
        y_train = train.iloc[:, 0].apply(select)
        y_dev = dev.iloc[:, 0].apply(select)
        y_test = test.iloc[:, 0].apply(select)

        # The binarizer is (re)fitted on the training labels of the current
        # modality, then applied to all three splits.
        lb.fit(y_train)
        data["LABEL"][mod] = {
            "TRAIN": lb.transform(y_train),
            "DEV": lb.transform(y_dev),
            "TEST": lb.transform(y_test),
        }

        # Single-argument print(...) behaves identically on Python 2 and 3.
        print(train.values)

        # Features are columns 1 .. second-to-last: the first column feeds the
        # label above, and the last column is dropped (presumably an empty
        # column produced by a trailing separator -- TODO confirm).
        data["LDA"][mod]["TRAIN"] = train.iloc[:, 1:-1].values
        data["LDA"][mod]["DEV"] = dev.iloc[:, 1:-1].values
        data["LDA"][mod]["TEST"] = test.iloc[:, 1:-1].values
        print(data["LDA"][mod]["TRAIN"].shape)

    data.sync()
finally:
    # Always release the underlying dbm file, even if loading a table failed.
    data.close()