From fee5922c3583c647d955c047809e5610ec8d7d63 Mon Sep 17 00:00:00 2001 From: Mathias Quillot Date: Wed, 24 Jul 2019 23:54:56 +0200 Subject: [PATCH] New way to exec the run file. Now you can run the clustering just for one model, or use the run file to launch it for each fold. You can configure it with the configuration files in config/. --- config/ivector_config.sh | 9 +++ config/pvector_config.sh | 11 ++++ config/xvector_config.sh | 9 +++ run-clustering.sh | 89 +++++++++++++++++++++++++++ run.sh | 156 ++++++++++++++++++++++++----------------------- 5 files changed, 198 insertions(+), 76 deletions(-) create mode 100644 config/ivector_config.sh create mode 100644 config/pvector_config.sh create mode 100644 config/xvector_config.sh create mode 100755 run-clustering.sh diff --git a/config/ivector_config.sh b/config/ivector_config.sh new file mode 100644 index 0000000..883091a --- /dev/null +++ b/config/ivector_config.sh @@ -0,0 +1,9 @@ +OUTDIR="exp/kmeans_euclidian/ivectors" +DATADIR="data" +NEW_LSTDIR="${OUTDIR}/lst" + +VECTOR_FILE="data/ivectors.txt" # To specify if there's only one +VECTOR_FILES_ONE=true # Specify there's only one file + +KMIN=2 +KMAX=100 diff --git a/config/pvector_config.sh b/config/pvector_config.sh new file mode 100644 index 0000000..03617b6 --- /dev/null +++ b/config/pvector_config.sh @@ -0,0 +1,11 @@ +OUTDIR="exp/kmeans_euclidian/teacher-pvector-1" +DATADIR="data" +NEW_LSTDIR="${OUTDIR}/lst" + +VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher" +VECTOR_FILES_END=".txt" +VECTOR_FILE="" # To specify if there's only one +VECTOR_FILES_ONE=false # Specify there's only one file + +KMIN=2 +KMAX=100 diff --git a/config/xvector_config.sh b/config/xvector_config.sh new file mode 100644 index 0000000..73a47fd --- /dev/null +++ b/config/xvector_config.sh @@ -0,0 +1,9 @@ +OUTDIR="exp/kmeans_euclidian/xvectors" +DATADIR="data" +NEW_LSTDIR="${OUTDIR}/lst" + +VECTOR_FILE="data/xvectors.txt" # To specify if there's only one +VECTOR_FILES_ONE=true # Specify 
there's only one file + +KMIN=2 +KMAX=100 diff --git a/run-clustering.sh b/run-clustering.sh new file mode 100755 index 0000000..7af7fbd --- /dev/null +++ b/run-clustering.sh @@ -0,0 +1,89 @@ +# +# This script runs the k-means clustering, then measures and plots the result for each k +# + + +# -- CONFIGURATION +# THE CALLER MUST SET THESE VARIABLES (PLUS KMIN, KMAX AND kfold) BEFORE SOURCING THIS SCRIPT +# Vector file +#VECTOR_FILE="" +# Train list +#TRAIN_LST="" +# Val list +#VAL_LST="" +# Exp directory +#EXP_DIR="" +# Metas file with type values +#METAS_TYPE="" +# Metas file with character values +#METAS_CHARACTER="" + + +#echo "VECTOR FILE: $VECTOR_FILE" +#echo "TRAIN LIST: $TRAIN_LST" +#echo "VAL LIST: $VAL_LST" +#echo "EXP DIR: $EXP_DIR" +#echo "METAS TYPE: $METAS_TYPE" +#echo "METAS_CHARACTER: $METAS_CHARACTER" + + + +# -- TRAIN KMEANS +echo "Clustering - ${kfold}" +python3 bin/cluster_kmeans.py "${VECTOR_FILE}" \ + "${TRAIN_LST}" \ + "${EXP_DIR}" --kmin "${KMIN}" --kmax "${KMAX}" + + + +for k in $(seq "${KMIN}" 1 "${KMAX}") +do + SUB_EXP_DIR="${EXP_DIR}/${k}" + + # -- EXTRACT KMEANS VALUES + echo "Kmeans Measuring and extraction - ${k}" + python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \ + "${VECTOR_FILE}" \ + --outfile "${SUB_EXP_DIR}/clustered_${k}.txt" + # -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR + # Measures + python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ + "${METAS_CHARACTER}" \ + "${TRAIN_LST}" \ + "${VAL_LST}" \ + --outfile "${SUB_EXP_DIR}/measures.json" + + # Plot count matrix for train (NOTE(review): VECTOR_FILE is passed where the type plots pass a metas file; confirm this is intended) + python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ + "${VECTOR_FILE}" \ + "${TRAIN_LST}" \ + --outfile "${SUB_EXP_DIR}/train_count_matrix.pdf" + + # Plot count matrix for val + python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ + "${VECTOR_FILE}" \ + "${VAL_LST}" \ + --outfile "${SUB_EXP_DIR}/val_count_matrix.pdf" + + # -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR + # Measures + python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ + "${METAS_TYPE}" \ + "${TRAIN_LST}" \ + 
"${VAL_LST}" \ + --outfile "${SUB_EXP_DIR}/measures_type.json" + + # Plot the count matrix of the train set (type labels) + python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ + "${METAS_TYPE}" \ + "${TRAIN_LST}" \ + --outfile "${SUB_EXP_DIR}/train_count_matrix_type.pdf" + + # Plot the count matrix of the validation set (type labels) + python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ + "${METAS_TYPE}" \ + "${VAL_LST}" \ + --outfile "${SUB_EXP_DIR}/val_count_matrix_type.pdf" + +done + diff --git a/run.sh b/run.sh index 5def9ac..353a9f1 100755 --- a/run.sh +++ b/run.sh @@ -1,14 +1,38 @@ -# Pour le moment, le run ne fait qu'executer -# quelques petites commandes que l'on souhaite -# tester. -OUTDIR="exp/kmeans_teacher_1/pvector-1" -DATADIR="data" -NEW_LSTDIR="${OUTDIR}/lst" +#OUTDIR="exp/test/pvector-2" +#DATADIR="data" +#NEW_LSTDIR="${OUTDIR}/lst" -kmin=2 -kmax=100 +#VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher" +#VECTOR_FILES_END=".txt" +#VECTOR_FILE="" # To specify if there's only one +#VECTOR_FILES_ONE=false # Specify there's only one file +#KMIN=2 +#KMAX=100 + +# -- LOAD CONFIG FILE (its path is the single required argument) +CONFIG_FILE="config.sh" + +if [ $# -eq 1 ] +then + CONFIG_FILE="$1" +else + echo "Need to have one and only one argument" >&2 + exit 1 +fi + +source "$CONFIG_FILE" + +# -- DEFAULT VALUES +if [ -z "$VECTOR_FILES_ONE" ] +then + VECTOR_FILES_ONE=false +fi + + + +# -- MAKE DIRECTORIES if [ ! -d "$OUTDIR" ]; then mkdir -p $OUTDIR @@ -19,82 +43,62 @@ then mkdir -p ${NEW_LSTDIR} fi -for kfold in 4 #..4} + +# -- BEGIN BY KFOLD +for kfold in {1..4} do - #echo "kfold = ${kfold}" - pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt" - lst_dir="${DATADIR}/pvectors_1rst/lst" - output_kfold="${OUTDIR}/${kfold}" + # Some useful variables + CHAR_INFO="${DATADIR}/character_information.csv" + TRAIN_TYPE_LST="${NEW_LSTDIR}/train_${kfold}_type.lst" + VAL_TYPE_LST="${NEW_LSTDIR}/val_${kfold}_type.lst" - if [ ! 
-d "${output_kfold}" ]; + # Configuration for the run clustering file (VECTOR_FILE is built per fold unless VECTOR_FILES_ONE is "true") + if [ "${VECTOR_FILES_ONE}" != "true" ] then - mkdir -p ${output_kfold} + VECTOR_FILE="${VECTOR_FILES_BEGIN}_${kfold}${VECTOR_FILES_END}" fi - + TRAIN_LST="${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst" + VAL_LST="${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst" + EXP_DIR="${OUTDIR}/${kfold}" + METAS_TYPE="${NEW_LSTDIR}/metas_${kfold}_type.lst" #* + METAS_CHARACTER="${DATADIR}/masseffect.lst" + + + + if [ ! -d "${EXP_DIR}" ]; + then + mkdir -p "${EXP_DIR}" + fi + + # Extract character information echo "Extracting character information" python3 "bin/replace_label.py" \ - "${DATADIR}/masseffect.lst" \ - "${DATADIR}/character_information.csv" \ - --field "type" --lst "data/pvectors_1rst/lst/train_${kfold}.lst" \ - --outfile "${NEW_LSTDIR}/train_${kfold}_type.lst" - + "${METAS_CHARACTER}" \ + "${CHAR_INFO}" \ + --field "type" \ + --lst "${TRAIN_LST}" \ + --outfile "${TRAIN_TYPE_LST}" + python3 "bin/replace_label.py" \ - "${DATADIR}/masseffect.lst" \ - "${DATADIR}/character_information.csv" \ - --field "type" --lst "data/pvectors_1rst/lst/val_${kfold}.lst" \ - --outfile "${NEW_LSTDIR}/val_${kfold}_type.lst" - cat "${NEW_LSTDIR}/train_${kfold}_type.lst" "${NEW_LSTDIR}/val_${kfold}_type.lst" > "${NEW_LSTDIR}/metas_${kfold}_type.lst" - - # -- TRAIN KMEANS - echo "Clustering - ${kfold}" - python3 bin/cluster_kmeans.py "${pvector_file}" \ - "${lst_dir}/train_${kfold}.lst" \ - "${output_kfold}" --kmin ${kmin} --kmax ${kmax} - - for k in $(seq ${kmin} 1 ${kmax}) - do - # -- EXTRACT KMEANS VALUES - echo "Kmeans Measuring and extraction - ${k}" - python3 bin/extract_kmeans.py "${output_kfold}/${k}/clustering_${k}.pkl" \ - "${pvector_file}" \ - --outfile "${output_kfold}/${k}/clustered_${k}.txt" - - - # -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR - # Measures - python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${pvector_file}" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" 
--outfile "${output_kfold}/${k}/measures.json" - - # Plot count matrix for train - python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ - ${pvector_file} ${lst_dir}/train_${kfold}.lst \ - --outfile ${output_kfold}/${k}/train_count_matrix.pdf - - # Plot count matrix for val - python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ - ${pvector_file} ${lst_dir}/val_${kfold}.lst \ - --outfile ${output_kfold}/${k}/val_count_matrix.pdf - - # Regroup measures with respect to character var - python3 bin/regroup-measures.py exp/kmeans_teacher_1/pvector-1/ - - # -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR - # Measures - python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures_type.json" - - # This script plot the count matrix of the train set - python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ - ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \ - --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf - - # This script plot the count matrix of the validation set - python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ - ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \ - --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf - - # Regroup measures with respect to type var - python3 bin/regroup-measures.py exp/kmeans_teacher_1/pvector-1/ --suffix "_type" --measurefile "measures_type.j - done + "${METAS_CHARACTER}" \ + "${CHAR_INFO}" \ + --field "type" \ + --lst "${VAL_LST}" \ + --outfile "${VAL_TYPE_LST}" + + cat "${TRAIN_TYPE_LST}" "${VAL_TYPE_LST}" > "${METAS_TYPE}" + + source "run-clustering.sh" done +# Regroup measures with respect to character classes +echo "Regrouping measures with respect to character classes" +python3 "bin/regroup-measures.py" "${OUTDIR}" + +# Regroup measures with 
respect to type classes +echo "Regrouping measures with respect to type classes" +python3 "bin/regroup-measures.py" "${OUTDIR}" --suffix "_type" --measurefile "measures_type.json" + + -- 1.8.2.3