Commit fee5922c3583c647d955c047809e5610ec8d7d63

Authored by Mathias Quillot
1 parent 151e596e35
Exists in master

New way to execute the run file. Now you can run the clustering just for one model…

…, or use the run file to launch it for each fold. You can configure it with the configuration files in config.

Showing 5 changed files with 198 additions and 76 deletions Inline Diff

config/ivector_config.sh
File was created 1 OUTDIR="exp/kmeans_euclidian/ivectors" # Root output directory; the run script creates one sub-directory per fold under it
2 DATADIR="data" # Directory holding the input lists and metadata read by the run script
3 NEW_LSTDIR="${OUTDIR}/lst" # Where the run script writes the generated *_type.lst files
4
5 VECTOR_FILE="data/ivectors.txt" # Vector file to use when a single file serves every fold
6 VECTOR_FILES_ONE=true # true: VECTOR_FILE above is used as-is instead of a per-fold file name
7
8 KMIN=2 # Lower bound of k: passed as --kmin and used as the start of the per-k loop
9 KMAX=100 # Upper bound of k: passed as --kmax and used as the end of the per-k loop
10
config/pvector_config.sh
File was created 1 OUTDIR="exp/kmeans_euclidian/teacher-pvector-1" # Root output directory; the run script creates one sub-directory per fold under it
2 DATADIR="data" # Directory holding the input lists and metadata read by the run script
3 NEW_LSTDIR="${OUTDIR}/lst" # Where the run script writes the generated *_type.lst files
4
5 VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher" # Per-fold vector file prefix; the run script appends "_<fold>" to it
6 VECTOR_FILES_END=".txt" # Per-fold vector file suffix, appended after the fold number
7 VECTOR_FILE="" # Left empty here: only filled in when a single file serves every fold
8 VECTOR_FILES_ONE=false # false: one vector file per fold, named from VECTOR_FILES_BEGIN and VECTOR_FILES_END
9
10 KMIN=2 # Lower bound of k: passed as --kmin and used as the start of the per-k loop
11 KMAX=100 # Upper bound of k: passed as --kmax and used as the end of the per-k loop
12
config/xvector_config.sh
File was created 1 OUTDIR="exp/kmeans_euclidian/xvectors" # Root output directory; the run script creates one sub-directory per fold under it
2 DATADIR="data" # Directory holding the input lists and metadata read by the run script
3 NEW_LSTDIR="${OUTDIR}/lst" # Where the run script writes the generated *_type.lst files
4
5 VECTOR_FILE="data/xvectors.txt" # Vector file to use when a single file serves every fold
6 VECTOR_FILES_ONE=true # true: VECTOR_FILE above is used as-is instead of a per-fold file name
7
8 KMIN=2 # Lower bound of k: passed as --kmin and used as the start of the per-k loop
9 KMAX=100 # Upper bound of k: passed as --kmax and used as the end of the per-k loop
10
File was created 1 #
2 # This script runs k-means clustering on one vector file, then extracts, measures and plots the result for every k from KMIN to KMAX
3 #
4
5
6 # -- CONFIGURATION
7 # THIS SCRIPT NEEDS THESE VARIABLES (plus KMIN, KMAX and kfold, which are used below)
8 # Vector file
9 #VECTOR_FILE=""
10 # Train list
11 #TRAIN_LST=""
12 # Val list
13 #VAL_LST=""
14 # Exp directory
15 #EXP_DIR=""
16 # Metas file with type values
17 #METAS_TYPE=""
18 # Metas file with character values
19 #METAS_CHARACTER=""
20
21
22 #echo "VECTOR FILE: $VECTOR_FILE"
23 #echo "TRAIN LIST: $TRAIN_LST"
24 #echo "VAL LIST: $VAL_LST"
25 #echo "EXP DIR: $EXP_DIR"
26 #echo "METAS TYPE: $METAS_TYPE"
27 #echo "METAS_CHARACTER: $METAS_CHARACTER"
28
29
30
31 # -- TRAIN KMEANS (kfold is only used for the log message; it is set by the caller's fold loop)
32 echo "Clustering - ${kfold}"
33 python3 bin/cluster_kmeans.py "${VECTOR_FILE}" \
34 "${TRAIN_LST}" \
35 "${EXP_DIR}" --kmin "${KMIN}" --kmax "${KMAX}"
36
37
38
39 for k in $(seq "${KMIN}" 1 "${KMAX}")
40 do
41 SUB_EXP_DIR="${EXP_DIR}/${k}"
42
43 # -- EXTRACT KMEANS VALUES
44 echo "Kmeans Measuring and extraction - ${k}"
45 python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \
46 "${VECTOR_FILE}" \
47 --outfile "${SUB_EXP_DIR}/clustered_${k}.txt"
48 # -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR
49 # Measures
50 python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
51 "${METAS_CHARACTER}" \
52 "${TRAIN_LST}" \
53 "${VAL_LST}" \
54 --outfile "${SUB_EXP_DIR}/measures.json"
55
56 # Plot count matrix for train. NOTE(review): this passes VECTOR_FILE where the type-based plots below pass a metas file; confirm METAS_CHARACTER was not intended
57 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
58 "${VECTOR_FILE}" \
59 "${TRAIN_LST}" \
60 --outfile "${SUB_EXP_DIR}/train_count_matrix.pdf"
61
62 # Plot count matrix for val (same second argument as the train plot above)
63 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
64 "${VECTOR_FILE}" \
65 "${VAL_LST}" \
66 --outfile "${SUB_EXP_DIR}/val_count_matrix.pdf"
67
68 # -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR
69 # Measures
70 python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
71 "${METAS_TYPE}" \
72 "${TRAIN_LST}" \
73 "${VAL_LST}" \
74 --outfile "${SUB_EXP_DIR}/measures_type.json"
75
76 # Plot the count matrix of the train set against the type metas
77 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
78 "${METAS_TYPE}" \
79 "${TRAIN_LST}" \
80 --outfile "${SUB_EXP_DIR}/train_count_matrix_type.pdf"
81
82 # Plot the count matrix of the validation set against the type metas
83 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
84 "${METAS_TYPE}" \
85 "${VAL_LST}" \
86 --outfile "${SUB_EXP_DIR}/val_count_matrix_type.pdf"
87
88 done
89
90
1 # Pour le moment, le run ne fait qu'executer
2 # quelques petites commandes que l'on souhaite
3 # tester.
4 1
5 OUTDIR="exp/kmeans_teacher_1/pvector-1" 2 #OUTDIR="exp/test/pvector-2"
6 DATADIR="data" 3 #DATADIR="data"
7 NEW_LSTDIR="${OUTDIR}/lst" 4 #NEW_LSTDIR="${OUTDIR}/lst"
8 5
9 kmin=2 6 #VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher"
10 kmax=100 7 #VECTOR_FILES_END=".txt"
8 #VECTOR_FILE="" # To specify if there's only one
9 #VECTOR_FILES_ONE=false # Specify there's only one file
11 10
11 #KMIN=2
12 #KMAX=100
13
14 # -- LOAD CONFIG FILE (exactly one argument is required: the path of the configuration file to source)
15 CONFIG_FILE="config.sh" # NOTE(review): never acts as a default, since any argument count other than 1 exits below
16
17 if [ $# -eq 1 ]
18 then
19 CONFIG_FILE="$1"
20 else
21 echo "Need to have one and only one argument" >&2
22 exit 255
23 fi
24
25 source "$CONFIG_FILE"
26
27 # -- DEFAULT VALUES CONFIGURATION (a config that does not set VECTOR_FILES_ONE gets false)
28 if [ -z "$VECTOR_FILES_ONE" ]
29 then
30 VECTOR_FILES_ONE=false
31 fi
32
33
34
35 # -- MAKE DIRECTORIES
12 if [ ! -d "$OUTDIR" ]; 36 if [ ! -d "$OUTDIR" ];
13 then 37 then
14 mkdir -p $OUTDIR 38 mkdir -p $OUTDIR
15 fi 39 fi
16 40
17 if [ ! -d "${NEW_LSTDIR}" ]; 41 if [ ! -d "${NEW_LSTDIR}" ];
18 then 42 then
19 mkdir -p ${NEW_LSTDIR} 43 mkdir -p ${NEW_LSTDIR}
20 fi 44 fi
21 45
22 for kfold in 4 #..4} 46
47 # -- BEGIN BY KFOLD
48 for kfold in {1..4}
23 do 49 do
24 #echo "kfold = ${kfold}" 50 # Some useful variables
25 pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt" 51 CHAR_INFO="${DATADIR}/character_information.csv"
26 lst_dir="${DATADIR}/pvectors_1rst/lst" 52 TRAIN_TYPE_LST="${NEW_LSTDIR}/train_${kfold}_type.lst"
27 output_kfold="${OUTDIR}/${kfold}" 53 VAL_TYPE_LST="${NEW_LSTDIR}/val_${kfold}_type.lst"
28 54
29 if [ ! -d "${output_kfold}" ]; 55 # Configuration for the run clustering file
56 if [ "${VECTOR_FILES_ONE}" != true ] # string compare: [ ! false ] only tests for a non-empty string, so the branch never ran
30 then 57 then
31 mkdir -p ${output_kfold} 58 VECTOR_FILE="${VECTOR_FILES_BEGIN}_${kfold}${VECTOR_FILES_END}"
32 fi 59 fi
33
34 60
61 TRAIN_LST="${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst"
62 VAL_LST="${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst"
63 EXP_DIR="${OUTDIR}/${kfold}"
64 METAS_TYPE="${NEW_LSTDIR}/metas_${kfold}_type.lst" #*
65 METAS_CHARACTER="${DATADIR}/masseffect.lst"
66
67
68
69 if [ ! -d "${EXP_DIR}" ];
70 then
71 mkdir -p ${EXP_DIR}
72 fi
73
74
35 # Extract character information 75 # Extract character information
36 echo "Extracting character information" 76 echo "Extracting character information"
37 python3 "bin/replace_label.py" \ 77 python3 "bin/replace_label.py" \
38 "${DATADIR}/masseffect.lst" \ 78 "${METAS_CHARACTER}" \
39 "${DATADIR}/character_information.csv" \ 79 "${CHAR_INFO}" \
40 --field "type" --lst "data/pvectors_1rst/lst/train_${kfold}.lst" \ 80 --field "type" \
41 --outfile "${NEW_LSTDIR}/train_${kfold}_type.lst" 81 --lst "${TRAIN_LST}" \
42 82 --outfile "${TRAIN_TYPE_LST}"
83
43 python3 "bin/replace_label.py" \ 84 python3 "bin/replace_label.py" \
44 "${DATADIR}/masseffect.lst" \ 85 "${METAS_CHARACTER}" \
45 "${DATADIR}/character_information.csv" \ 86 "${CHAR_INFO}" \
46 --field "type" --lst "data/pvectors_1rst/lst/val_${kfold}.lst" \ 87 --field "type" \
47 --outfile "${NEW_LSTDIR}/val_${kfold}_type.lst" 88 --lst "${VAL_LST}" \
48 cat "${NEW_LSTDIR}/train_${kfold}_type.lst" "${NEW_LSTDIR}/val_${kfold}_type.lst" > "${NEW_LSTDIR}/metas_${kfold}_type.lst" 89 --outfile "${VAL_TYPE_LST}"
49 90
50 # -- TRAIN KMEANS 91 cat "${TRAIN_TYPE_LST}" "${VAL_TYPE_LST}" > "${METAS_TYPE}"
51 echo "Clustering - ${kfold}"
52 python3 bin/cluster_kmeans.py "${pvector_file}" \
53 "${lst_dir}/train_${kfold}.lst" \
54 "${output_kfold}" --kmin ${kmin} --kmax ${kmax}
55 92
56 for k in $(seq ${kmin} 1 ${kmax}) 93 source "run-clustering.sh"
57 do 94 done
58 # -- EXTRACT KMEANS VALUES
59 echo "Kmeans Measuring and extraction - ${k}"
60 python3 bin/extract_kmeans.py "${output_kfold}/${k}/clustering_${k}.pkl" \