run.sh
3.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
#OUTDIR="exp/test/pvector-2"
#DATADIR="data"
#NEW_LSTDIR="${OUTDIR}/lst"
#VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher"
#VECTOR_FILES_END=".txt"
#VECTOR_FILE="" # Path of the vector file; set it only when a single file is used for every fold
#VECTOR_FILES_ONE=false # Set to anything other than false (e.g. true) when there is only one vector file
#KMIN=2
#KMAX=100
# -- LOAD CONFIG FILE
# Exactly one positional argument is required: the path of a shell file that
# is sourced into this script. The variables used further down (OUTDIR,
# DATADIR, NEW_LSTDIR, ...) are not assigned anywhere in this script, so they
# are expected to come from that file.
if [ $# -ne 1 ]
then
  # Diagnostics belong on stderr so they are not mixed with regular output.
  echo "Need to have one and only one argument" >&2
  # 255 is the status bash already produced for the former `exit -1`; writing
  # it explicitly keeps callers unchanged and avoids an out-of-range value.
  exit 255
fi
CONFIG_FILE="$1"
# Quoted so a path containing spaces or glob characters is sourced as one word.
source "$CONFIG_FILE"
# -- DEFAULT VALUES CONFIGURATION
# Use "false" when VECTOR_FILES_ONE is unset or empty after loading the config;
# any non-empty value is left untouched.
: "${VECTOR_FILES_ONE:=false}"
# -- MAKE DIRECTORIES
# Create the output directory and the list directory when they are missing.
# `mkdir -p` succeeds silently on an existing directory, so no separate -d
# test is needed. The paths are quoted so that spaces or glob characters in
# them do not split them into several arguments, and `--` keeps a path that
# starts with a dash from being parsed as an option.
mkdir -p -- "${OUTDIR}"
mkdir -p -- "${NEW_LSTDIR}"
# -- KFOLD MIN and MAX
# Bounds of the fold loop: default to 1 and 4 when unset or empty, otherwise
# keep whatever value is already defined.
: "${MIN_KFOLD:=1}"
: "${MAX_KFOLD:=4}"
# -- BEGIN BY KFOLD
# For every fold index produced by `seq MIN_KFOLD MAX_KFOLD`:
#   1. derive the per-fold input and output paths,
#   2. write the "type" lists for train and val with bin/replace_label.py
#      and concatenate them,
#   3. write the "lang" lists for train and val with awk and concatenate them,
#   4. source run-clustering.sh in the current shell.
# Every expansion is quoted so that paths containing spaces or glob characters
# stay single words (an unquoted redirect target with a space is an
# "ambiguous redirect" error in bash).
for kfold in $(seq "${MIN_KFOLD}" "${MAX_KFOLD}")
do
  # Some useful variables
  CHAR_INFO="${DATADIR}/character_information.csv"
  TRAIN_TYPE_LST="${NEW_LSTDIR}/train_${kfold}_type.lst"
  VAL_TYPE_LST="${NEW_LSTDIR}/val_${kfold}_type.lst"
  TRAIN_LANG_LST="${NEW_LSTDIR}/train_${kfold}_lang.lst"
  VAL_LANG_LST="${NEW_LSTDIR}/val_${kfold}_lang.lst"

  # Configuration for the run clustering file
  # Build a per-fold vector file name only when VECTOR_FILES_ONE is "false";
  # otherwise VECTOR_FILE keeps the value it already has. The quoted operand
  # and the portable `=` operator avoid a test error on an empty value.
  if [ "${VECTOR_FILES_ONE}" = "false" ]
  then
    VECTOR_FILE="${VECTOR_FILES_BEGIN}_${kfold}${VECTOR_FILES_END}"
  fi
  TRAIN_LST="${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst"
  VAL_LST="${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst"
  EXP_DIR="${OUTDIR}/${kfold}"
  METAS_TYPE="${NEW_LSTDIR}/metas_${kfold}_type.lst"
  METAS_CHARACTER="${DATADIR}/masseffect.lst"
  METAS_LANG="${NEW_LSTDIR}/metas_${kfold}_lang.lst"

  # `mkdir -p` is a no-op on an existing directory, so no -d test is needed.
  mkdir -p -- "${EXP_DIR}"

  # EXTRACT TYPE INFORMATION
  echo "Extracting character information"
  echo "Replace in train"
  python3 "bin/replace_label.py" \
    "${METAS_CHARACTER}" \
    "${CHAR_INFO}" \
    --field "type" \
    --lst "${TRAIN_LST}" \
    --outfile "${TRAIN_TYPE_LST}"
  echo "Replace in val"
  python3 "bin/replace_label.py" \
    "${METAS_CHARACTER}" \
    "${CHAR_INFO}" \
    --field "type" \
    --lst "${VAL_LST}" \
    --outfile "${VAL_TYPE_LST}"
  echo "Merge them"
  cat "${TRAIN_TYPE_LST}" "${VAL_TYPE_LST}" > "${METAS_TYPE}"

  # EXTRACT LANGUAGE INFORMATION
  # awk copies field 1 over field 2 of each comma-separated line. The program
  # is a bare pattern, so a line is printed only when the assigned value is
  # truthy for awk (non-empty and not numerically zero).
  echo "Language info for train"
  awk '$2=$1' FS=, OFS=, "${TRAIN_LST}" > "${TRAIN_LANG_LST}"
  echo "Language info for val"
  awk '$2=$1' FS=, OFS=, "${VAL_LST}" > "${VAL_LANG_LST}"
  echo "Merge them"
  cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}"

  echo "Then Run Clustering"
  # Sourced (not executed), so it runs in this shell and can see the variables
  # assigned above. NOTE(review): which of them it actually reads is not
  # visible from this script - confirm before renaming any.
  source "run-clustering.sh"
done
# Regroup measures with respect to character classes
# bin/regroup-measures.py receives the output directory as its only argument.
# OUTDIR is quoted so a path with spaces is passed as a single argument.
echo "Regrouping measures with respect to character classes"
python3 "bin/regroup-measures.py" "${OUTDIR}"

# Regroup measures with respect to type classes
# Same script, called with the "_type" suffix and the measures_type.json
# measure file.
echo "Regrouping measures with respect to type classes"
python3 "bin/regroup-measures.py" "${OUTDIR}" --suffix "_type" --measurefile "measures_type.json"