Commit fee5922c3583c647d955c047809e5610ec8d7d63
1 parent
151e596e35
Exists in
master
New way to execute the run file. Now you can run the clustering just for one model…
…, or use the run file to launch it for each fold. You can configure it with the configuration files in config.
Showing 5 changed files with 198 additions and 76 deletions Side-by-side Diff
config/ivector_config.sh
config/pvector_config.sh
1 | +OUTDIR="exp/kmeans_euclidian/teacher-pvector-1" | |
2 | +DATADIR="data" | |
3 | +NEW_LSTDIR="${OUTDIR}/lst" | |
4 | + | |
5 | +VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher" | |
6 | +VECTOR_FILES_END=".txt" | |
7 | +VECTOR_FILE="" # To specify if there's only one | |
8 | +VECTOR_FILES_ONE=false # Specify there's only one file | |
9 | + | |
10 | +KMIN=2 | |
11 | +KMAX=100 |
config/xvector_config.sh
run-clustering.sh
1 | +# | |
2 | +# This script aims to compute clustering | |
3 | +# | |
4 | + | |
5 | + | |
6 | +# -- CONFIGURATION | |
7 | +# THIS SCRIPT NEEDS THESE VARIABLES | |
8 | +# Vector file | |
9 | +#VECTOR_FILE="" | |
10 | +# Train list | |
10 | +#TRAIN_LST="" | |
12 | +# Val list | |
13 | +#VAL_LST="" | |
14 | +# Exp directory | |
15 | +#EXP_DIR="" | |
16 | +# Metas file with type values | |
17 | +#METAS_TYPE="" | |
18 | +# Metas file with character values | |
19 | +#METAS_CHARACTER="" | |
20 | + | |
21 | + | |
22 | +#echo "VECTOR FILE: $VECTOR_FILE" | |
23 | +#echo "TRAIN LIST: $TRAIN_LST" | |
24 | +#echo "VAL LIST: $VAL_LST" | |
25 | +#echo "EXP DIR: $EXP_DIR" | |
26 | +#echo "METAS TYPE: $METAS_TYPE" | |
27 | +#echo "METAS_CHARACTER: $METAS_CHARACTER" | |
28 | + | |
29 | + | |
30 | + | |
31 | +# -- TRAIN KMEANS | |
32 | +echo "Clustering - ${kfold}" | |
33 | +python3 bin/cluster_kmeans.py "${VECTOR_FILE}" \ | |
34 | + "${TRAIN_LST}" \ | |
35 | + "${EXP_DIR}" --kmin ${KMIN} --kmax ${KMAX} | |
36 | + | |
37 | + | |
38 | + | |
39 | +for k in $(seq ${KMIN} 1 ${KMAX}) | |
40 | +do | |
41 | + SUB_EXP_DIR="${EXP_DIR}/${k}" | |
42 | + | |
43 | + # -- EXTRACT KMEANS VALUES | |
44 | + echo "Kmeans Measuring and extraction - ${k}" | |
45 | + python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \ | |
46 | + "${VECTOR_FILE}" \ | |
47 | + --outfile "${SUB_EXP_DIR}/clustered_${k}.txt" | |
48 | + # -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR | |
49 | + # Measures | |
50 | + python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | |
51 | + "${METAS_CHARACTER}" \ | |
52 | + "${TRAIN_LST}" \ | |
53 | + "${VAL_LST}" \ | |
54 | + --outfile "${SUB_EXP_DIR}/measures.json" | |
55 | + | |
56 | + # Plot count matrix for train | |
57 | + python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | |
58 | + ${VECTOR_FILE} \ | |
59 | + ${TRAIN_LST} \ | |
60 | + --outfile "${SUB_EXP_DIR}/train_count_matrix.pdf" | |
61 | + | |
62 | + # Plot count matrix for val | |
63 | + python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | |
64 | + ${VECTOR_FILE} \ | |
65 | + ${VAL_LST} \ | |
66 | + --outfile "${SUB_EXP_DIR}/val_count_matrix.pdf" | |
67 | + | |
68 | + # -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR | |
69 | + # Measures | |
70 | + python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | |
71 | + "${METAS_TYPE}" \ | |
72 | + "${TRAIN_LST}" \ | |
73 | + "${VAL_LST}" \ | |
74 | + --outfile "${SUB_EXP_DIR}/measures_type.json" | |
75 | + | |
76 | + # This step plots the count matrix of the train set | |
77 | + python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | |
78 | + "${METAS_TYPE}" \ | |
79 | + "${TRAIN_LST}" \ | |
80 | + --outfile "${SUB_EXP_DIR}/train_count_matrix_type.pdf" | |
81 | + | |
82 | + # This step plots the count matrix of the validation set | |
83 | + python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | |
84 | + "${METAS_TYPE}" \ | |
85 | + "${VAL_LST}" \ | |
86 | + --outfile "${SUB_EXP_DIR}/val_count_matrix_type.pdf" | |
87 | + | |
88 | +done |
run.sh
1 | -# Pour le moment, le run ne fait qu'executer | |
2 | -# quelques petites commandes que l'on souhaite | |
3 | -# tester. | |
4 | 1 | |
5 | -OUTDIR="exp/kmeans_teacher_1/pvector-1" | |
6 | -DATADIR="data" | |
7 | -NEW_LSTDIR="${OUTDIR}/lst" | |
2 | +#OUTDIR="exp/test/pvector-2" | |
3 | +#DATADIR="data" | |
4 | +#NEW_LSTDIR="${OUTDIR}/lst" | |
8 | 5 | |
9 | -kmin=2 | |
10 | -kmax=100 | |
6 | +#VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher" | |
7 | +#VECTOR_FILES_END=".txt" | |
8 | +#VECTOR_FILE="" # To specify if there's only one | |
9 | +#VECTOR_FILES_ONE=false # Specify there's only one file | |
11 | 10 | |
11 | +#KMIN=2 | |
12 | +#KMAX=100 | |
13 | + | |
14 | +# -- LOAD CONFIG FILE | |
15 | +CONFIG_FILE="config.sh" | |
16 | + | |
17 | +if [ $# -eq 1 ] | |
18 | +then | |
19 | + CONFIG_FILE="$1" | |
20 | +else | |
21 | + echo "Need to have one and only one argument" | |
22 | + exit -1 | |
23 | +fi | |
24 | + | |
25 | +source $CONFIG_FILE | |
26 | + | |
27 | +# -- DEFAULT VALUES CONFIGURATION | |
28 | +if [ -z "$VECTOR_FILES_ONE" ] | |
29 | +then | |
30 | + VECTOR_FILES_ONE=false | |
31 | +fi | |
32 | + | |
33 | + | |
34 | + | |
35 | +# -- MAKE DIRECTORIES | |
12 | 36 | if [ ! -d "$OUTDIR" ]; |
13 | 37 | then |
14 | 38 | mkdir -p $OUTDIR |
15 | 39 | |
16 | 40 | |
17 | 41 | |
18 | 42 | |
19 | 43 | |
20 | 44 | |
21 | 45 | |
22 | 46 | |
23 | 47 | |
24 | 48 | |
25 | 49 | |
... | ... | @@ -19,82 +43,61 @@ |
19 | 43 | mkdir -p ${NEW_LSTDIR} |
20 | 44 | fi |
21 | 45 | |
22 | -for kfold in 4 #..4} | |
46 | + | |
47 | +# -- BEGIN BY KFOLD | |
48 | +for kfold in {1..4} | |
23 | 49 | do |
24 | - #echo "kfold = ${kfold}" | |
25 | - pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt" | |
26 | - lst_dir="${DATADIR}/pvectors_1rst/lst" | |
27 | - output_kfold="${OUTDIR}/${kfold}" | |
50 | + # Some useful variables | |
51 | + CHAR_INFO="${DATADIR}/character_information.csv" | |
52 | + TRAIN_TYPE_LST="${NEW_LSTDIR}/train_${kfold}_type.lst" | |
53 | + VAL_TYPE_LST="${NEW_LSTDIR}/val_${kfold}_type.lst" | |
28 | 54 | |
29 | - if [ ! -d "${output_kfold}" ]; | |
55 | + # Configuration for the run clustering file | |
56 | + if [ ! ${VECTOR_FILES_ONE} ] | |
30 | 57 | then |
31 | - mkdir -p ${output_kfold} | |
58 | + VECTOR_FILE="${VECTOR_FILES_BEGIN}_${kfold}${VECTOR_FILES_END}" | |
32 | 59 | fi |
33 | - | |
34 | 60 | |
61 | + TRAIN_LST="${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst" | |
62 | + VAL_LST="${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst" | |
63 | + EXP_DIR="${OUTDIR}/${kfold}" | |
64 | + METAS_TYPE="${NEW_LSTDIR}/metas_${kfold}_type.lst" #* | |
65 | + METAS_CHARACTER="${DATADIR}/masseffect.lst" | |
66 | + | |
67 | + | |
68 | + | |
69 | + if [ ! -d "${EXP_DIR}" ]; | |
70 | + then | |
71 | + mkdir -p ${EXP_DIR} | |
72 | + fi | |
73 | + | |
74 | + | |
35 | 75 | # Extract character information |
36 | 76 | echo "Extracting character information" |
37 | 77 | python3 "bin/replace_label.py" \ |
38 | - "${DATADIR}/masseffect.lst" \ | |
39 | - "${DATADIR}/character_information.csv" \ | |
40 | - --field "type" --lst "data/pvectors_1rst/lst/train_${kfold}.lst" \ | |
41 | - --outfile "${NEW_LSTDIR}/train_${kfold}_type.lst" | |
42 | - | |
78 | + "${METAS_CHARACTER}" \ | |
79 | + "${CHAR_INFO}" \ | |
80 | + --field "type" \ | |
81 | + --lst "${TRAIN_LST}" \ | |
82 | + --outfile "${TRAIN_TYPE_LST}" | |
83 | + | |
43 | 84 | python3 "bin/replace_label.py" \ |
44 | - "${DATADIR}/masseffect.lst" \ | |
45 | - "${DATADIR}/character_information.csv" \ | |
46 | - --field "type" --lst "data/pvectors_1rst/lst/val_${kfold}.lst" \ | |
47 | - --outfile "${NEW_LSTDIR}/val_${kfold}_type.lst" | |
48 | - cat "${NEW_LSTDIR}/train_${kfold}_type.lst" "${NEW_LSTDIR}/val_${kfold}_type.lst" > "${NEW_LSTDIR}/metas_${kfold}_type.lst" | |
85 | + "${METAS_CHARACTER}" \ | |
86 | + "${CHAR_INFO}" \ | |
87 | + --field "type" \ | |
88 | + --lst "${VAL_LST}" \ | |
89 | + --outfile "${VAL_TYPE_LST}" | |
49 | 90 | |
50 | - # -- TRAIN KMEANS | |
51 | - echo "Clustering - ${kfold}" | |
52 | - python3 bin/cluster_kmeans.py "${pvector_file}" \ | |
53 | - "${lst_dir}/train_${kfold}.lst" \ | |
54 | - "${output_kfold}" --kmin ${kmin} --kmax ${kmax} | |
91 | + cat "${TRAIN_TYPE_LST}" "${VAL_TYPE_LST}" > "${METAS_TYPE}" | |
55 | 92 | |
56 | - for k in $(seq ${kmin} 1 ${kmax}) | |
57 | - do | |
58 | - # -- EXTRACT KMEANS VALUES | |
59 | - echo "Kmeans Measuring and extraction - ${k}" | |
60 | - python3 bin/extract_kmeans.py "${output_kfold}/${k}/clustering_${k}.pkl" \ | |
61 | - "${pvector_file}" \ | |
62 | - --outfile "${output_kfold}/${k}/clustered_${k}.txt" | |
63 | - | |
64 | - | |
65 | - # -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR | |
66 | - # Measures | |
67 | - python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${pvector_file}" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures.json" | |
68 | - | |
69 | - # Plot count matrix for train | |
70 | - python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | |
71 | - ${pvector_file} ${lst_dir}/train_${kfold}.lst \ | |
72 | - --outfile ${output_kfold}/${k}/train_count_matrix.pdf | |
73 | - | |
74 | - # Plot count matrix for val | |
75 | - python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | |
76 | - ${pvector_file} ${lst_dir}/val_${kfold}.lst \ | |
77 | - --outfile ${output_kfold}/${k}/val_count_matrix.pdf | |
93 | + source "run-clustering.sh" | |
94 | +done | |
78 | 95 | |
79 | - # Regroup measures with respect to character var | |
80 | - python3 bin/regroup-measures.py exp/kmeans_teacher_1/pvector-1/ | |
96 | +# Regroup measures with respect to character classes | |
97 | +echo "Regrouping measures with respect to character classes" | |
98 | +python3 "bin/regroup-measures.py" ${OUTDIR} | |
81 | 99 | |
82 | - # -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR | |
83 | - # Measures | |
84 | - python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures_type.json" | |
85 | - | |
86 | - # This script plot the count matrix of the train set | |
87 | - python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | |
88 | - ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \ | |
89 | - --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf | |
90 | - | |
91 | - # This script plot the count matrix of the validation set | |
92 | - python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | |
93 | - ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \ | |
94 | - --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf | |
95 | - | |
96 | - # Regroup measures with respect to type var | |
97 | - python3 bin/regroup-measures.py exp/kmeans_teacher_1/pvector-1/ --suffix "_type" --measurefile "measures_type.j | |
98 | - done | |
99 | -done | |
100 | +# Regroup measures with respect to type classes | |
101 | +echo "Regrouping measures with respect to type classes" | |
102 | +python3 "bin/regroup-measures.py" ${OUTDIR} --suffix "_type" --measurefile "measures_type.json" |