run-measures.sh
4.45 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# For now, this script only runs
# a few small commands that we want
# to test.
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Root directory that receives every output of this experiment.
OUTDIR="exp/kmeans_euclidian/teacher-pvector-1"
# Alias of OUTDIR; used to build the per-fold / per-k sub-directories.
EXP_DIR="${OUTDIR}"
# Root directory of the input data.
DATADIR="data"
# Directory that receives the generated .lst files.
NEW_LSTDIR="${OUTDIR}/lst"
# Inclusive range of cluster counts (k) to process.
kmin=2
kmax=100

# `mkdir -p` succeeds when the directory already exists, so no prior
# existence test is needed. Abort early if the directories cannot be created:
# nothing after this point can write its output otherwise.
mkdir -p -- "${OUTDIR}" "${NEW_LSTDIR}" || exit 1
# For each fold index (1..4): build the "lang" list files, then, for every
# cluster count k in [kmin, kmax], extract the k-means labels, run the
# measures and draw the train / validation count matrices.
# Reads the globals DATADIR, OUTDIR, EXP_DIR, NEW_LSTDIR, kmin and kmax.
for kfold in {1..4}; do
  pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt"
  VECTOR_FILE="${pvector_file}"
  lst_dir="${DATADIR}/pvectors_1rst/lst"
  # Only referenced by the disabled "type" commands kept further down.
  output_kfold="${OUTDIR}/${kfold}"

  # -- Disabled: relabelling of the lists with the "type" field
  #python3 "bin/replace_label.py" \
  #    "${DATADIR}/masseffect.lst" \
  #    "${DATADIR}/character_information.csv" \
  #    --field "type" --lst "data/pvectors_1rst/lst/train_${kfold}.lst" \
  #    --outfile "${NEW_LSTDIR}/train_${kfold}_type.lst"
  #python3 "bin/replace_label.py" \
  #    "${DATADIR}/masseffect.lst" \
  #    "${DATADIR}/character_information.csv" \
  #    --field "type" --lst "data/pvectors_1rst/lst/val_${kfold}.lst" \
  #    --outfile "${NEW_LSTDIR}/val_${kfold}_type.lst"
  #cat "${NEW_LSTDIR}/train_${kfold}_type.lst" "${NEW_LSTDIR}/val_${kfold}_type.lst" > "${NEW_LSTDIR}/metas_${kfold}_type.lst"

  TRAIN_LST="${lst_dir}/train_${kfold}.lst"
  VAL_LST="${lst_dir}/val_${kfold}.lst"
  TRAIN_LANG_LST="${NEW_LSTDIR}/train_${kfold}_lang.lst"
  VAL_LANG_LST="${NEW_LSTDIR}/val_${kfold}_lang.lst"
  METAS_LANG="${NEW_LSTDIR}/metas_${kfold}_lang.lst"

  # EXTRACT LANGUAGE INFORMATION
  # Copy the first comma-separated field into the second one. The explicit
  # `$1 != ""` guard replaces the bare `$2=$1` pattern, which also discarded
  # every record whose first field evaluates to numeric zero (e.g. "0").
  # Blank lines and records with an empty first field are still skipped.
  awk -F, -v OFS=, '$1 != "" { $2 = $1; print }' "${TRAIN_LST}" > "${TRAIN_LANG_LST}"
  echo "TRAIN EXTRACT LANGUAGE INFO DONE"
  awk -F, -v OFS=, '$1 != "" { $2 = $1; print }' "${VAL_LST}" > "${VAL_LANG_LST}"
  echo "VAL EXTRACT LANGUAGE INFO DONE"
  cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}"
  echo "GLOBAL EXTRACT LANGUAGE INFO DONE"

  echo "Clustering - ${kfold}"
  for ((k = kmin; k <= kmax; k++)); do
    echo "Kmeans Measuring and ploting - ${k}"
    SUB_EXP_DIR="${EXP_DIR}/${kfold}/${k}"
    # Temporary file holding the extracted labels; removed at the end of
    # the iteration.
    clustered_file="${SUB_EXP_DIR}/clustered_${k}.txt"

    # -- EXTRACT CLUSTERING LABELS
    # Every following step reads ${clustered_file}; if the extraction fails
    # there is nothing to measure, so report it and move on to the next k.
    if ! python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \
        "${VECTOR_FILE}" \
        --outfile "${clustered_file}"; then
      echo "extract_kmeans.py failed for fold ${kfold}, k=${k}; skipping" >&2
      rm -f -- "${clustered_file}"
      continue
    fi

    # -- MEASURES AND PLOT WITH RESPECT TO LANG VAR
    # Measures
    python3 bin/measure_clustering.py "${clustered_file}" \
      "${METAS_LANG}" \
      "${TRAIN_LST}" \
      "${VAL_LST}" \
      --outfile "${SUB_EXP_DIR}/measures_lang.json"

    # Plot the count matrix of the train set
    python3 bin/plot-count-matrix.py "${clustered_file}" \
      "${METAS_LANG}" \
      "${TRAIN_LST}" \
      --outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf"

    # Plot the count matrix of the validation set
    python3 bin/plot-count-matrix.py "${clustered_file}" \
      "${METAS_LANG}" \
      "${VAL_LST}" \
      --outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf"

    rm -f -- "${clustered_file}"

    # -- Disabled: measures and plots with respect to the "type" field
    #python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" \
    #    "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" \
    #    "${lst_dir}/val_${kfold}.lst" \
    #    --outfile "${output_kfold}/${k}/measures_type.json"
    # This script plot the count matrix of the train set
    #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
    #    ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \
    #    --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf
    # This script plot the count matrix of the validation set
    #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
    #    ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \
    #    --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf
    # This script plot the count matrix of the train set
    #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
    #    ${pvector_file} ${lst_dir}/train_${kfold}.lst \
    #    --outfile ${output_kfold}/${k}/train_count_matrix.pdf
    # This script plot the count matrix of the validation set
    #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
    #    ${pvector_file} ${lst_dir}/val_${kfold}.lst \
    #    --outfile ${output_kfold}/${k}/val_count_matrix.pdf
  done
done