diff --git a/config/ivector_config.sh b/config/ivector_config.sh new file mode 100644 index 0000000..883091a --- /dev/null +++ b/config/ivector_config.sh @@ -0,0 +1,9 @@ +OUTDIR="exp/kmeans_euclidian/ivectors" +DATADIR="data" +NEW_LSTDIR="${OUTDIR}/lst" + +VECTOR_FILE="data/ivectors.txt" # Single vector file, used for every fold when VECTOR_FILES_ONE=true +VECTOR_FILES_ONE=true # true: VECTOR_FILE above is the only vector file + +KMIN=2 +KMAX=100 diff --git a/config/pvector_config.sh b/config/pvector_config.sh new file mode 100644 index 0000000..03617b6 --- /dev/null +++ b/config/pvector_config.sh @@ -0,0 +1,11 @@ +OUTDIR="exp/kmeans_euclidian/teacher-pvector-1" +DATADIR="data" +NEW_LSTDIR="${OUTDIR}/lst" + +VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher" +VECTOR_FILES_END=".txt" +VECTOR_FILE="" # Single vector file; left empty here because VECTOR_FILES_ONE=false +VECTOR_FILES_ONE=false # false: run.sh is meant to build one file name per fold from VECTOR_FILES_BEGIN/END + +KMIN=2 +KMAX=100 diff --git a/config/xvector_config.sh b/config/xvector_config.sh new file mode 100644 index 0000000..73a47fd --- /dev/null +++ b/config/xvector_config.sh @@ -0,0 +1,9 @@ +OUTDIR="exp/kmeans_euclidian/xvectors" +DATADIR="data" +NEW_LSTDIR="${OUTDIR}/lst" + +VECTOR_FILE="data/xvectors.txt" # Single vector file, used for every fold when VECTOR_FILES_ONE=true +VECTOR_FILES_ONE=true # true: VECTOR_FILE above is the only vector file + +KMIN=2 +KMAX=100 diff --git a/run-clustering.sh b/run-clustering.sh new file mode 100755 index 0000000..7af7fbd --- /dev/null +++ b/run-clustering.sh @@ -0,0 +1,89 @@ +# +# Clusters the vectors with k-means, then measures and plots the result for each k +# + + +# -- CONFIGURATION +# THIS SCRIPT NEEDS THESE VARIABLES (it also reads KMIN, KMAX and kfold) +# Vector file +#VECTOR_FILE="" +# Train list +#TRAIN_LST="" +# Val list +#VAL_LST="" +# Exp directory +#EXP_DIR="" +# Metas file with type values +#METAS_TYPE="" +# Metas file with character values +#METAS_CHARACTER="" + + +#echo "VECTOR FILE: $VECTOR_FILE" +#echo "TRAIN LIST: $TRAIN_LST" +#echo "VAL LIST: $VAL_LST" +#echo "EXP DIR: $EXP_DIR" +#echo "METAS TYPE: $METAS_TYPE" +#echo "METAS_CHARACTER: $METAS_CHARACTER" + + + +# -- TRAIN KMEANS +echo "Clustering 
- ${kfold}" +python3 bin/cluster_kmeans.py "${VECTOR_FILE}" \ + "${TRAIN_LST}" \ + "${EXP_DIR}" --kmin "${KMIN}" --kmax "${KMAX}" + + + +for k in $(seq "${KMIN}" 1 "${KMAX}") +do + SUB_EXP_DIR="${EXP_DIR}/${k}" + + # -- EXTRACT KMEANS VALUES + echo "Kmeans Measuring and extraction - ${k}" + python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \ + "${VECTOR_FILE}" \ + --outfile "${SUB_EXP_DIR}/clustered_${k}.txt" + # -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR + # Measures + python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ + "${METAS_CHARACTER}" \ + "${TRAIN_LST}" \ + "${VAL_LST}" \ + --outfile "${SUB_EXP_DIR}/measures.json" + + # Plot count matrix for train (NOTE(review): passes VECTOR_FILE where the type plots pass a metas file - confirm) + python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ + "${VECTOR_FILE}" \ + "${TRAIN_LST}" \ + --outfile "${SUB_EXP_DIR}/train_count_matrix.pdf" + + # Plot count matrix for val + python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ + "${VECTOR_FILE}" \ + "${VAL_LST}" \ + --outfile "${SUB_EXP_DIR}/val_count_matrix.pdf" + + # -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR + # Measures + python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ + "${METAS_TYPE}" \ + "${TRAIN_LST}" \ + "${VAL_LST}" \ + --outfile "${SUB_EXP_DIR}/measures_type.json" + + # Plot the count matrix of the train set (type labels) + python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ + "${METAS_TYPE}" \ + "${TRAIN_LST}" \ + --outfile "${SUB_EXP_DIR}/train_count_matrix_type.pdf" + + # Plot the count matrix of the validation set (type labels) + python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ + "${METAS_TYPE}" \ + "${VAL_LST}" \ + --outfile "${SUB_EXP_DIR}/val_count_matrix_type.pdf" + +done + diff --git a/run.sh b/run.sh index 5def9ac..353a9f1 100755 --- a/run.sh +++ b/run.sh @@ -1,14 +1,38 @@ -# Pour le moment, le run ne fait qu'executer -# quelques petites commandes que l'on souhaite -# tester. 
-OUTDIR="exp/kmeans_teacher_1/pvector-1" -DATADIR="data" -NEW_LSTDIR="${OUTDIR}/lst" +#OUTDIR="exp/test/pvector-2" +#DATADIR="data" +#NEW_LSTDIR="${OUTDIR}/lst" -kmin=2 -kmax=100 +#VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher" +#VECTOR_FILES_END=".txt" +#VECTOR_FILE="" # To specify if there's only one +#VECTOR_FILES_ONE=false # Specify there's only one file +#KMIN=2 +#KMAX=100 + +# -- LOAD CONFIG FILE (exactly one argument: the config path; the config.sh default below is never used) +CONFIG_FILE="config.sh" + +if [ $# -eq 1 ] +then + CONFIG_FILE="$1" +else + echo "Need to have one and only one argument" >&2 + exit 1 +fi + +source "$CONFIG_FILE" + +# -- DEFAULT VALUES CONFIGURATION +if [ -z "$VECTOR_FILES_ONE" ] +then + VECTOR_FILES_ONE=false +fi + + + +# -- MAKE DIRECTORIES if [ ! -d "$OUTDIR" ]; then mkdir -p $OUTDIR @@ -19,82 +43,62 @@ then mkdir -p ${NEW_LSTDIR} fi -for kfold in 4 #..4} + +# -- BEGIN BY KFOLD +for kfold in {1..4} do - #echo "kfold = ${kfold}" - pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt" - lst_dir="${DATADIR}/pvectors_1rst/lst" - output_kfold="${OUTDIR}/${kfold}" + # Some useful variables + CHAR_INFO="${DATADIR}/character_information.csv" + TRAIN_TYPE_LST="${NEW_LSTDIR}/train_${kfold}_type.lst" + VAL_TYPE_LST="${NEW_LSTDIR}/val_${kfold}_type.lst" - if [ ! -d "${output_kfold}" ]; + # Variables read by run-clustering.sh; VECTOR_FILE is rebuilt per fold unless VECTOR_FILES_ONE is true + if [ "${VECTOR_FILES_ONE}" != true ] then - mkdir -p ${output_kfold} + VECTOR_FILE="${VECTOR_FILES_BEGIN}_${kfold}${VECTOR_FILES_END}" fi - + TRAIN_LST="${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst" + VAL_LST="${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst" + EXP_DIR="${OUTDIR}/${kfold}" + METAS_TYPE="${NEW_LSTDIR}/metas_${kfold}_type.lst" #* + METAS_CHARACTER="${DATADIR}/masseffect.lst" + + + + if [ ! 
-d "${EXP_DIR}" ]; + then + mkdir -p "${EXP_DIR}" + fi + + # Extract character information echo "Extracting character information" python3 "bin/replace_label.py" \ - "${DATADIR}/masseffect.lst" \ - "${DATADIR}/character_information.csv" \ - --field "type" --lst "data/pvectors_1rst/lst/train_${kfold}.lst" \ - --outfile "${NEW_LSTDIR}/train_${kfold}_type.lst" - + "${METAS_CHARACTER}" \ + "${CHAR_INFO}" \ + --field "type" \ + --lst "${TRAIN_LST}" \ + --outfile "${TRAIN_TYPE_LST}" + python3 "bin/replace_label.py" \ - "${DATADIR}/masseffect.lst" \ - "${DATADIR}/character_information.csv" \ - --field "type" --lst "data/pvectors_1rst/lst/val_${kfold}.lst" \ - --outfile "${NEW_LSTDIR}/val_${kfold}_type.lst" - cat "${NEW_LSTDIR}/train_${kfold}_type.lst" "${NEW_LSTDIR}/val_${kfold}_type.lst" > "${NEW_LSTDIR}/metas_${kfold}_type.lst" - - # -- TRAIN KMEANS - echo "Clustering - ${kfold}" - python3 bin/cluster_kmeans.py "${pvector_file}" \ - "${lst_dir}/train_${kfold}.lst" \ - "${output_kfold}" --kmin ${kmin} --kmax ${kmax} - - for k in $(seq ${kmin} 1 ${kmax}) - do - # -- EXTRACT KMEANS VALUES - echo "Kmeans Measuring and extraction - ${k}" - python3 bin/extract_kmeans.py "${output_kfold}/${k}/clustering_${k}.pkl" \ - "${pvector_file}" \ - --outfile "${output_kfold}/${k}/clustered_${k}.txt" - - - # -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR - # Measures - python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${pvector_file}" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures.json" - - # Plot count matrix for train - python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ - ${pvector_file} ${lst_dir}/train_${kfold}.lst \ - --outfile ${output_kfold}/${k}/train_count_matrix.pdf - - # Plot count matrix for val - python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ - ${pvector_file} ${lst_dir}/val_${kfold}.lst \ - --outfile 
${output_kfold}/${k}/val_count_matrix.pdf - - # Regroup measures with respect to character var - python3 bin/regroup-measures.py exp/kmeans_teacher_1/pvector-1/ - - # -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR - # Measures - python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures_type.json" - - # This script plot the count matrix of the train set - python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ - ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \ - --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf - - # This script plot the count matrix of the validation set - python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ - ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \ - --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf - - # Regroup measures with respect to type var - python3 bin/regroup-measures.py exp/kmeans_teacher_1/pvector-1/ --suffix "_type" --measurefile "measures_type.j - done + "${METAS_CHARACTER}" \ + "${CHAR_INFO}" \ + --field "type" \ + --lst "${VAL_LST}" \ + --outfile "${VAL_TYPE_LST}" + + cat "${TRAIN_TYPE_LST}" "${VAL_TYPE_LST}" > "${METAS_TYPE}" + + source "run-clustering.sh" done +# Regroup the per-fold measures of OUTDIR with respect to character classes +echo "Regrouping measures with respect to character classes" +python3 "bin/regroup-measures.py" "${OUTDIR}" + +# Regroup the per-fold measures of OUTDIR with respect to type classes +echo "Regrouping measures with respect to type classes" +python3 "bin/regroup-measures.py" "${OUTDIR}" --suffix "_type" --measurefile "measures_type.json" + +