Commit 95142dfdc54218f17529b6757ed7f310811b9534

Authored by Mathias Quillot
1 parent 0ab563604a
Exists in master

maj. No comment

Showing 7 changed files with 135 additions and 29 deletions Inline Diff

bin/replace_label_lst.py
File was created 1
2 import argparse
3
4 parser = argparse.ArgumentParser(description="extract label from lst file, move a label in fact")
5
6
config/pv_from_xv_config.sh
File was created 1
2 # Framework configuration
3 OUTDIR="exp/kmeans_euclidian/pv_from_xv"
4 DATADIR="data"
5 NEW_LSTDIR="${OUTDIR}/lst"
6
7 VECTOR_FILES_BEGIN="${DATADIR}/pv_from_xv/me_pv_teacher"
8 VECTOR_FILES_END=".txt"
9 VECTOR_FILE="" # To specify if there's only one
10 VECTOR_FILES_ONE=false # Specify there's only one file
11
12 KMIN=2
13 KMAX=100
14
config/pvector_config.sh
1
1 OUTDIR="exp/kmeans_euclidian/teacher-pvector-1" 2 OUTDIR="exp/kmeans_euclidian/teacher-pvector-1"
2 DATADIR="data" 3 DATADIR="data"
3 NEW_LSTDIR="${OUTDIR}/lst" 4 NEW_LSTDIR="${OUTDIR}/lst"
4 5
5 VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher" 6 VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher"
6 VECTOR_FILES_END=".txt" 7 VECTOR_FILES_END=".txt"
7 VECTOR_FILE="" # To specify if there's only one 8 VECTOR_FILE="" # To specify if there's only one
8 VECTOR_FILES_ONE=false # Specify there's only one file 9 VECTOR_FILES_ONE=false # Specify there's only one file
9 10
10 KMIN=2 11 KMIN=2
11 KMAX=100 12 KMAX=100
12 13
1 1
2 2
3 # Number of set 3 # Number of set
4 k=4 4 k=4
5 kmean=88
5 6
7
6 # Vector features file 8 # Vector features file
7 VECTOR_FILE_MASSEFFECT="data/pvectors_1rst/pvectors_teacher_${k}.txt" 9 VECTOR_FILE_MASSEFFECT="data/xvectors.txt"
8 10
9 # Number of clusters
10 kmean=6
11 11
12 # Dirs 12 # Dirs
13 EXP_DIR="exp/kmeans_euclidian/teacher-pvector-1/${k}/${kmean}" 13 EXP_DIR="exp/kmeans_euclidian/xvectors/${k}/${kmean}"
14 CLUSTERING="${EXP_DIR}/clustering_${kmean}.pkl" 14 CLUSTERING="${EXP_DIR}/clustering_${kmean}.pkl"
15 15
16 16
17 # Output dirs 17 # Output dirs
18 OUTFILE_MASSEFFECT="data/pvectors_1rst/saved_clustered/masseffect_clustered_${k}_${kmean}.txt" 18 OUTFILE_MASSEFFECT="data/xvectors/saved_clustered/masseffect_clustered_xvectors_${k}_${kmean}.txt"
19 19
20 python3 bin/extract_kmeans.py "${CLUSTERING}" \ 20 python3 bin/extract_kmeans.py "${CLUSTERING}" \
21 "${VECTOR_FILE_MASSEFFECT}" \ 21 "${VECTOR_FILE_MASSEFFECT}" \
1 # 1 #
2 # This script aims to compute clustering 2 # This script aims to compute clustering
3 # 3 #
4 4
5 5
6 # -- CONFIGURATION 6 # -- CONFIGURATION
7 # THIS SCRIPT NEEDS THESE VARIABLES 7 # THIS SCRIPT NEEDS THESE VARIABLES
8 # Vector file 8 # Vector file
9 #VECTOR_FILE="" 9 #VECTOR_FILE=""
10 # Train list 10 # Train list
11 #TRAIN_LST="" 11 #TRAIN_LST=""
12 # Val list 12 # Val list
13 #VAL_LST="" 13 #VAL_LST=""
14 # Exp directory 14 # Exp directory
15 #EXP_DIR="" 15 #EXP_DIR=""
16 # Metas file with type values 16 # Metas file with type values
17 #METAS_TYPE="" 17 #METAS_TYPE=""
18 # Metas file with character values 18 # Metas file with character values
19 #METAS_CHARACTER="" 19 #METAS_CHARACTER=""
20 20
21 21
22 #echo "VECTOR FILE: $VECTOR_FILE" 22 #echo "VECTOR FILE: $VECTOR_FILE"
23 #echo "TRAIN LIST: $TRAIN_LST" 23 #echo "TRAIN LIST: $TRAIN_LST"
24 #echo "VAL LIST: $VAL_LST" 24 #echo "VAL LIST: $VAL_LST"
25 #echo "EXP DIR: $EXP_DIR" 25 #echo "EXP DIR: $EXP_DIR"
26 #echo "METAS TYPE: $METAS_TYPE" 26 #echo "METAS TYPE: $METAS_TYPE"
27 #echo "METAS_CHARACTER: $METAS_CHARACTER" 27 #echo "METAS_CHARACTER: $METAS_CHARACTER"
28 28
29 29
30 30
31 # -- TRAIN KMEANS 31 # -- TRAIN KMEANS
32 echo "Clustering - ${kfold}" 32 echo "Clustering - ${kfold}"
33 python3 bin/cluster_kmeans.py "${VECTOR_FILE}" \ 33 python3 bin/cluster_kmeans.py "${VECTOR_FILE}" \
34 "${TRAIN_LST}" \ 34 "${TRAIN_LST}" \
35 "${EXP_DIR}" --kmin ${KMIN} --kmax ${KMAX} 35 "${EXP_DIR}" --kmin ${KMIN} --kmax ${KMAX}
36 36
37 37
38 38
39 for k in $(seq ${KMIN} 1 ${KMAX}) 39 for k in $(seq ${KMIN} 1 ${KMAX})
40 do 40 do
41 SUB_EXP_DIR="${EXP_DIR}/${k}" 41 SUB_EXP_DIR="${EXP_DIR}/${k}"
42 42
43 # -- EXTRACT KMEANS VALUES 43 # -- EXTRACT KMEANS VALUES
44 echo "Kmeans Measuring and extraction - ${k}" 44 echo "Kmeans Measuring and extraction - ${k}"
45 python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \ 45 python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \
46 "${VECTOR_FILE}" \ 46 "${VECTOR_FILE}" \
47 --outfile "${SUB_EXP_DIR}/clustered_${k}.txt" 47 --outfile "${SUB_EXP_DIR}/clustered_${k}.txt"
48 # -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR 48 # -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR
49 # Measures 49 # Measures
50 python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ 50 python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
51 "${METAS_CHARACTER}" \ 51 "${METAS_CHARACTER}" \
52 "${TRAIN_LST}" \ 52 "${TRAIN_LST}" \
53 "${VAL_LST}" \ 53 "${VAL_LST}" \
54 --outfile "${SUB_EXP_DIR}/measures.json" 54 --outfile "${SUB_EXP_DIR}/measures.json"
55 55
56 # Plot count matrix for train 56 # Plot count matrix for train
57 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ 57 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
58 ${VECTOR_FILE} \ 58 ${VECTOR_FILE} \
59 ${TRAIN_LST} \ 59 ${TRAIN_LST} \
60 --outfile "${SUB_EXP_DIR}/train_count_matrix.pdf" 60 --outfile "${SUB_EXP_DIR}/train_count_matrix.pdf"
61 61
62 # Plot count matrix for val 62 # Plot count matrix for val
63 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ 63 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
64 ${VECTOR_FILE} \ 64 ${VECTOR_FILE} \
65 ${VAL_LST} \ 65 ${VAL_LST} \
66 --outfile "${SUB_EXP_DIR}/val_count_matrix.pdf" 66 --outfile "${SUB_EXP_DIR}/val_count_matrix.pdf"
67 67
68 # -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR 68 # -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR
69 # Measures 69 # Measures
70 python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ 70 python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
71 "${METAS_TYPE}" \ 71 "${METAS_TYPE}" \
72 "${TRAIN_LST}" \ 72 "${TRAIN_LST}" \
73 "${VAL_LST}" \ 73 "${VAL_LST}" \
74 --outfile "${SUB_EXP_DIR}/measures_type.json" 74 --outfile "${SUB_EXP_DIR}/measures_type.json"
75 75
76 # This script plots the count matrix of the train set 76 # This script plots the count matrix of the train set
77 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ 77 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
78 "${METAS_TYPE}" \ 78 "${METAS_TYPE}" \
79 "${TRAIN_LST}" \ 79 "${TRAIN_LST}" \
80 --outfile "${SUB_EXP_DIR}/train_count_matrix_type.pdf" 80 --outfile "${SUB_EXP_DIR}/train_count_matrix_type.pdf"
81 81
82 # This script plots the count matrix of the validation set 82 # This script plots the count matrix of the validation set
83 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ 83 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
84 "${METAS_TYPE}" \ 84 "${METAS_TYPE}" \
85 "${VAL_LST}" \ 85 "${VAL_LST}" \
86 --outfile "${SUB_EXP_DIR}/val_count_matrix_type.pdf" 86 --outfile "${SUB_EXP_DIR}/val_count_matrix_type.pdf"
87 87
88
89 # -- MEASURES AND PLOT WITH RESPECT TO LANG VAR
90 # Measures
91 python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
92 "${METAS_LANG}" \
93 "${TRAIN_LST}" \
94 "${VAL_LST}" \
95 --outfile "${SUB_EXP_DIR}/measures_lang.json"
96
97 # This script plots the count matrix of the train set
98 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
99 "${METAS_LANG}" \
100 "${TRAIN_LST}" \
101 --outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf"
102
103 # This script plots the count matrix of the validation set
104 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
105 "${METAS_LANG}" \
106 "${VAL_LST}" \
107 --outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf"
108
88 done 109 done
89 110
90 111
1 # For now, this run script only executes 1 # For now, this run script only executes
2 # a few small commands that we want 2 # a few small commands that we want
3 # to test. 3 # to test.
4 4
5 OUTDIR="exp/kmeans_teacher_1/pvector-1" 5 OUTDIR="exp/kmeans_euclidian/teacher-pvector-1"
6 EXP_DIR=${OUTDIR}
6 DATADIR="data" 7 DATADIR="data"
7 NEW_LSTDIR="${OUTDIR}/lst" 8 NEW_LSTDIR="${OUTDIR}/lst"
8 9
9 kmin=2 10 kmin=2
10 kmax=100 11 kmax=100
11 12
12 if [ ! -d "$OUTDIR" ]; 13 if [ ! -d "$OUTDIR" ];
13 then 14 then
14 mkdir -p $OUTDIR 15 mkdir -p $OUTDIR
15 fi 16 fi
16 17
17 if [ ! -d "$NEW_LSTDIR" ]; 18 if [ ! -d "$NEW_LSTDIR" ];
18 then 19 then
19 mkdir -p $NEW_LSTDIR 20 mkdir -p $NEW_LSTDIR
20 fi 21 fi
21 22
22 for kfold in {1..4} 23 for kfold in {1..4}
23 do 24 do
24 pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt" 25 pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt"
26 VECTOR_FILE=$pvector_file
25 lst_dir="${DATADIR}/pvectors_1rst/lst" 27 lst_dir="${DATADIR}/pvectors_1rst/lst"
26 output_kfold="${OUTDIR}/${kfold}" 28 output_kfold="${OUTDIR}/${kfold}"
27 29
28 #python3 "bin/replace_label.py" \ 30 #python3 "bin/replace_label.py" \
29 # "${DATADIR}/masseffect.lst" \ 31 # "${DATADIR}/masseffect.lst" \
30 # "${DATADIR}/character_information.csv" \ 32 # "${DATADIR}/character_information.csv" \
31 # --field "type" --lst "data/pvectors_1rst/lst/train_${kfold}.lst" \ 33 # --field "type" --lst "data/pvectors_1rst/lst/train_${kfold}.lst" \
32 # --outfile "${NEW_LSTDIR}/train_${kfold}_type.lst" 34 # --outfile "${NEW_LSTDIR}/train_${kfold}_type.lst"
33 35
34 #python3 "bin/replace_label.py" \ 36 #python3 "bin/replace_label.py" \
35 # "${DATADIR}/masseffect.lst" \ 37 # "${DATADIR}/masseffect.lst" \
36 # "${DATADIR}/character_information.csv" \ 38 # "${DATADIR}/character_information.csv" \
37 # --field "type" --lst "data/pvectors_1rst/lst/val_${kfold}.lst" \ 39 # --field "type" --lst "data/pvectors_1rst/lst/val_${kfold}.lst" \
38 # --outfile "${NEW_LSTDIR}/val_${kfold}_type.lst" 40 # --outfile "${NEW_LSTDIR}/val_${kfold}_type.lst"
39 41
40 #cat "${NEW_LSTDIR}/train_${kfold}_type.lst" "${NEW_LSTDIR}/val_${kfold}_type.lst" > "${NEW_LSTDIR}/metas_${kfold}_type.lst" 42 #cat "${NEW_LSTDIR}/train_${kfold}_type.lst" "${NEW_LSTDIR}/val_${kfold}_type.lst" > "${NEW_LSTDIR}/metas_${kfold}_type.lst"
43 TRAIN_LST=${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst
44 VAL_LST=${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst
45 TRAIN_LANG_LST=${NEW_LSTDIR}/train_${kfold}_lang.lst
46 VAL_LANG_LST=${NEW_LSTDIR}/val_${kfold}_lang.lst
47 METAS_LANG=${NEW_LSTDIR}/metas_${kfold}_lang.lst
48
49 # EXTRACT LANGUAGE INFORMATION
50 awk '$2=$1' FS=, OFS=, ${TRAIN_LST} > ${TRAIN_LANG_LST}
51 echo "VAL EXTRACT LANGUAGE INFO DONE"
52 awk '$2=$1' FS=, OFS=, ${VAL_LST} > ${VAL_LANG_LST}
53 echo "TRAIN EXTRACT LANGUAGE INFO DONE"
54 cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}"
55 echo "GLOBAL EXTRACT LANGUAGE INFO DONE"
56
41 57
42
43 echo "Clustering - ${kfold}" 58 echo "Clustering - ${kfold}"
44 59
45 for k in $(seq ${kmin} 1 ${kmax}) 60 for k in $(seq ${kmin} 1 ${kmax})
46 do 61 do
47 echo "Kmeans Measuring and ploting - ${k}" 62 echo "Kmeans Measuring and ploting - ${k}"
48 63
49 # This script compute measures from clustering 64 SUB_EXP_DIR="${EXP_DIR}/${kfold}/${k}"
50 #python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${pvector_file}" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures.json" 65
51 66 # -- EXTRACT CLUSTERING LABELS
67 python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \
68 "${VECTOR_FILE}" \
69 --outfile "${SUB_EXP_DIR}/clustered_${k}.txt"
70
71 # -- MEASURES AND PLOT WITH RESPECT TO LANG VAR
72 # Measures
73 python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
74 "${METAS_LANG}" \
75 "${TRAIN_LST}" \
76 "${VAL_LST}" \
77 --outfile "${SUB_EXP_DIR}/measures_lang.json"
78
79 # This script plots the count matrix of the train set
80 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
81 "${METAS_LANG}" \
82 "${TRAIN_LST}" \
83 --outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf"
84
85 # This script plots the count matrix of the validation set
86 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
87 "${METAS_LANG}" \
88 "${VAL_LST}" \
89 --outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf"
90
91 rm ${SUB_EXP_DIR}/clustered_${k}.txt
52 #python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" \ 92 #python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" \
53 # "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" \ 93 # "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" \
54 # "${lst_dir}/val_${kfold}.lst" \ 94 # "${lst_dir}/val_${kfold}.lst" \
55 # --outfile "${output_kfold}/${k}/measures_type.json" 95 # --outfile "${output_kfold}/${k}/measures_type.json"
56 96
57 # This script plots the count matrix of the train set 97 # This script plots the count matrix of the train set
58 python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ 98 #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
59 ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \ 99 # ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \
60 --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf 100 # --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf
61 101
62 # This script plots the count matrix of the validation set 102 # This script plots the count matrix of the validation set
63 python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ 103 #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
64 ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \ 104 # ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \
65 --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf 105 # --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf
66 106
67 # This script plots the count matrix of the train set 107 # This script plots the count matrix of the train set
68 python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ 108 #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
69 ${pvector_file} ${lst_dir}/train_${kfold}.lst \ 109 # ${pvector_file} ${lst_dir}/train_${kfold}.lst \
70 --outfile ${output_kfold}/${k}/train_count_matrix.pdf 110 # --outfile ${output_kfold}/${k}/train_count_matrix.pdf
71 111
72 # This script plots the count matrix of the validation set 112 # This script plots the count matrix of the validation set
73 python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ 113 #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
74 ${pvector_file} ${lst_dir}/val_${kfold}.lst \ 114 # ${pvector_file} ${lst_dir}/val_${kfold}.lst \
75 --outfile ${output_kfold}/${k}/val_count_matrix.pdf 115 # --outfile ${output_kfold}/${k}/val_count_matrix.pdf
76 done 116 done
77 done 117 done
78 118
1 1
2 #OUTDIR="exp/test/pvector-2" 2 #OUTDIR="exp/test/pvector-2"
3 #DATADIR="data" 3 #DATADIR="data"
4 #NEW_LSTDIR="${OUTDIR}/lst" 4 #NEW_LSTDIR="${OUTDIR}/lst"
5 5
6 #VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher" 6 #VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher"
7 #VECTOR_FILES_END=".txt" 7 #VECTOR_FILES_END=".txt"
8 #VECTOR_FILE="" # To specify if there's only one 8 #VECTOR_FILE="" # To specify if there's only one
9 #VECTOR_FILES_ONE=false # Specify there's only one file 9 #VECTOR_FILES_ONE=false # Specify there's only one file
10 10
11 #KMIN=2 11 #KMIN=2
12 #KMAX=100 12 #KMAX=100
13 13
14 # -- LOAD CONFIG FILE 14 # -- LOAD CONFIG FILE
15 CONFIG_FILE="config.sh" 15 CONFIG_FILE="config.sh"
16 16
17 if [ $# -eq 1 ] 17 if [ $# -eq 1 ]
18 then 18 then
19 CONFIG_FILE="$1" 19 CONFIG_FILE="$1"
20 else 20 else
21 echo "Need to have one and only one argument" 21 echo "Need to have one and only one argument"
22 exit -1 22 exit -1
23 fi 23 fi
24 24
25 source $CONFIG_FILE 25 source $CONFIG_FILE
26 26
27 # -- DEFAULT VALUES CONFIGURATION 27 # -- DEFAULT VALUES CONFIGURATION
28 if [ -z "$VECTOR_FILES_ONE" ] 28 if [ -z "$VECTOR_FILES_ONE" ]
29 then 29 then
30 VECTOR_FILES_ONE=false 30 VECTOR_FILES_ONE=false
31 fi 31 fi
32 32
33 33
34 34
35 # -- MAKE DIRECTORIES 35 # -- MAKE DIRECTORIES
36 if [ ! -d "$OUTDIR" ]; 36 if [ ! -d "$OUTDIR" ];
37 then 37 then
38 mkdir -p $OUTDIR 38 mkdir -p $OUTDIR
39 fi 39 fi
40 40
41 if [ ! -d "${NEW_LSTDIR}" ]; 41 if [ ! -d "${NEW_LSTDIR}" ];
42 then 42 then
43 mkdir -p ${NEW_LSTDIR} 43 mkdir -p ${NEW_LSTDIR}
44 fi 44 fi
45 45
46 46
47 # -- KFOLD MIN and MAX
48 if [ -z "$MIN_KFOLD" ]
49 then
50 MIN_KFOLD=1
51 fi
52
53 if [ -z "$MAX_KFOLD" ]
54 then
55 MAX_KFOLD=4
56 fi
57
47 # -- BEGIN BY KFOLD 58 # -- BEGIN BY KFOLD
48 for kfold in {1..4} 59 for kfold in $(seq ${MIN_KFOLD} ${MAX_KFOLD})
49 do 60 do
50 # Some useful variables 61 # Some useful variables
51 CHAR_INFO="${DATADIR}/character_information.csv" 62 CHAR_INFO="${DATADIR}/character_information.csv"
52 TRAIN_TYPE_LST="${NEW_LSTDIR}/train_${kfold}_type.lst" 63 TRAIN_TYPE_LST="${NEW_LSTDIR}/train_${kfold}_type.lst"
53 VAL_TYPE_LST="${NEW_LSTDIR}/val_${kfold}_type.lst" 64 VAL_TYPE_LST="${NEW_LSTDIR}/val_${kfold}_type.lst"
65 TRAIN_LANG_LST="${NEW_LSTDIR}/train_${kfold}_lang.lst"
66 VAL_LANG_LST="${NEW_LSTDIR}/val_${kfold}_lang.lst"
54 67
55 # Configuration for the run clustering file 68 # Configuration for the run clustering file
56 if [ ${VECTOR_FILES_ONE} == false ] 69 if [ ${VECTOR_FILES_ONE} == false ]
57 then 70 then
58 VECTOR_FILE="${VECTOR_FILES_BEGIN}_${kfold}${VECTOR_FILES_END}" 71 VECTOR_FILE="${VECTOR_FILES_BEGIN}_${kfold}${VECTOR_FILES_END}"
59 fi 72 fi
60 73
61 TRAIN_LST="${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst" 74 TRAIN_LST="${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst"
62 VAL_LST="${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst" 75 VAL_LST="${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst"
63 EXP_DIR="${OUTDIR}/${kfold}" 76 EXP_DIR="${OUTDIR}/${kfold}"
64 METAS_TYPE="${NEW_LSTDIR}/metas_${kfold}_type.lst" #* 77 METAS_TYPE="${NEW_LSTDIR}/metas_${kfold}_type.lst"
65 METAS_CHARACTER="${DATADIR}/masseffect.lst" 78 METAS_CHARACTER="${DATADIR}/masseffect.lst"
66 79 METAS_LANG="${NEW_LSTDIR}/metas_${kfold}_lang.lst"
67 80
68 81
69 if [ ! -d "${EXP_DIR}" ]; 82 if [ ! -d "${EXP_DIR}" ];
70 then 83 then
71 mkdir -p ${EXP_DIR} 84 mkdir -p ${EXP_DIR}
72 fi 85 fi
73 86
74 87
75 # Extract character information 88 # EXTRACT TYPE INFORMATION
76 echo "Extracting character information" 89 echo "Extracting character information"
90 echo "Replace in train"
77 python3 "bin/replace_label.py" \ 91 python3 "bin/replace_label.py" \
78 "${METAS_CHARACTER}" \ 92 "${METAS_CHARACTER}" \
79 "${CHAR_INFO}" \ 93 "${CHAR_INFO}" \
80 --field "type" \ 94 --field "type" \
81 --lst "${TRAIN_LST}" \ 95 --lst "${TRAIN_LST}" \
82 --outfile "${TRAIN_TYPE_LST}" 96 --outfile "${TRAIN_TYPE_LST}"
83 97
98 echo "Replace in val"
84 python3 "bin/replace_label.py" \ 99 python3 "bin/replace_label.py" \
85 "${METAS_CHARACTER}" \ 100 "${METAS_CHARACTER}" \
86 "${CHAR_INFO}" \ 101 "${CHAR_INFO}" \
87 --field "type" \ 102 --field "type" \
88 --lst "${VAL_LST}" \ 103 --lst "${VAL_LST}" \
89 --outfile "${VAL_TYPE_LST}" 104 --outfile "${VAL_TYPE_LST}"
90 105
106 echo "Merge them"
91 cat "${TRAIN_TYPE_LST}" "${VAL_TYPE_LST}" > "${METAS_TYPE}" 107 cat "${TRAIN_TYPE_LST}" "${VAL_TYPE_LST}" > "${METAS_TYPE}"
92 108
109 # EXTRACT LANGUAGE INFORMATION
110 echo "Language info for train"
111 awk '$2=$1' FS=, OFS=, ${TRAIN_LST} > ${TRAIN_LANG_LST}
112 echo "Language info for val"
113 awk '$2=$1' FS=, OFS=, ${VAL_LST} > ${VAL_LANG_LST}
114
115 echo "Merge them"
116 cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}"
117
118 echo "Then Run Clustering"
93 source "run-clustering.sh" 119 source "run-clustering.sh"
94 done 120 done
95 121
96 # Regroup measures with respect to character classes 122 # Regroup measures with respect to character classes
97 echo "Regrouping measures with respect to character classes" 123 echo "Regrouping measures with respect to character classes"
98 python3 "bin/regroup-measures.py" ${OUTDIR} 124 python3 "bin/regroup-measures.py" ${OUTDIR}
99 125
100 # Regroup measures with respect to type classes 126 # Regroup measures with respect to type classes
101 echo "Regrouping measures with respect to type classes" 127 echo "Regrouping measures with respect to type classes"
102 python3 "bin/regroup-measures.py" ${OUTDIR} --suffix "_type" --measurefile "measures_type.json" 128 python3 "bin/regroup-measures.py" ${OUTDIR} --suffix "_type" --measurefile "measures_type.json"
103 129
104 130
105 131