# Patch summary (reviewer header placed before the first "diff --git" entry).
#
# Files touched, as visible in the hunks below:
#   - bin/replace_label_lst.py    new file; only imports argparse and builds an
#                                 ArgumentParser (no arguments declared yet).
#   - config/pv_from_xv_config.sh new configuration: OUTDIR, DATADIR, NEW_LSTDIR,
#                                 VECTOR_FILES_BEGIN/END, VECTOR_FILE,
#                                 VECTOR_FILES_ONE, KMIN=2, KMAX=100.
#   - config/pvector_config.sh    adds one leading blank line.
#   - extract-labels.sh           reads data/xvectors.txt, sets kmean=88 and moves
#                                 EXP_DIR / OUTFILE_MASSEFFECT to xvectors paths.
#   - run-clustering.sh           adds measure_clustering.py and two
#                                 plot-count-matrix.py calls keyed on METAS_LANG.
#   - run-measures.sh             builds train/val/metas *_lang.lst with awk, runs
#                                 extract_kmeans.py per (kfold, k), runs the lang
#                                 measures and plots, then removes
#                                 clustered_${k}.txt; older plot calls are
#                                 commented out.
#   - run.sh                      kfold range comes from MIN_KFOLD / MAX_KFOLD
#                                 (set to 1 and 4 when empty); builds the
#                                 *_lang.lst files before sourcing
#                                 run-clustering.sh.
#
# Review fixes applied to added ("+") lines only, one-for-one, so every hunk
# keeps its line counts and all context lines are untouched:
#   - run-measures.sh: the "TRAIN ... DONE" / "VAL ... DONE" messages were
#     printed after the wrong awk step; they now follow the matching step.
#   - run-measures.sh, run.sh: awk input files and redirection targets are
#     double-quoted so paths are not word-split or glob-expanded.
#   - run-measures.sh: rm uses "--" and a quoted path.
#   - run.sh: the seq arguments are double-quoted.
#
# NOTE(review): awk '$2=$1' uses the assignment as a pattern, so a record whose
# first field is empty or numerically zero is not printed. Confirm the .lst
# files can never contain such records, otherwise the *_lang.lst files will be
# shorter than their inputs.
diff --git a/bin/replace_label_lst.py b/bin/replace_label_lst.py new file mode 100644 index 0000000..47db163 --- /dev/null +++ b/bin/replace_label_lst.py @@ -0,0 +1,5 @@ + +import argparse + +parser = argparse.ArgumentParser(description="extract label from lst file, move a label in fact") + diff --git a/config/pv_from_xv_config.sh b/config/pv_from_xv_config.sh new file mode 100644 index 0000000..fda429e --- /dev/null +++ b/config/pv_from_xv_config.sh @@ -0,0 +1,13 @@ + +# Framework configuration +OUTDIR="exp/kmeans_euclidian/pv_from_xv" +DATADIR="data" +NEW_LSTDIR="${OUTDIR}/lst" + +VECTOR_FILES_BEGIN="${DATADIR}/pv_from_xv/me_pv_teacher" +VECTOR_FILES_END=".txt" +VECTOR_FILE="" # To specify if there's only one +VECTOR_FILES_ONE=false # Specify there's only one file + +KMIN=2 +KMAX=100 diff --git a/config/pvector_config.sh b/config/pvector_config.sh index 03617b6..e75dae1 100644 --- a/config/pvector_config.sh +++ b/config/pvector_config.sh @@ -1,3 +1,4 @@ + OUTDIR="exp/kmeans_euclidian/teacher-pvector-1" DATADIR="data" NEW_LSTDIR="${OUTDIR}/lst" diff --git a/extract-labels.sh b/extract-labels.sh index 34ea1f4..fa51890 100755 --- a/extract-labels.sh +++ b/extract-labels.sh @@ -2,20 +2,20 @@ # Number of set k=4 +kmean=88 + # Vector features file -VECTOR_FILE_MASSEFFECT="data/pvectors_1rst/pvectors_teacher_${k}.txt" +VECTOR_FILE_MASSEFFECT="data/xvectors.txt" -# Number of clusters -kmean=6 # Dirs -EXP_DIR="exp/kmeans_euclidian/teacher-pvector-1/${k}/${kmean}" +EXP_DIR="exp/kmeans_euclidian/xvectors/${k}/${kmean}" CLUSTERING="${EXP_DIR}/clustering_${kmean}.pkl" # Output dirs -OUTFILE_MASSEFFECT="data/pvectors_1rst/saved_clustered/masseffect_clustered_${k}_${kmean}.txt" +OUTFILE_MASSEFFECT="data/xvectors/saved_clustered/masseffect_clustered_xvectors_${k}_${kmean}.txt" python3 bin/extract_kmeans.py "${CLUSTERING}" \ "${VECTOR_FILE_MASSEFFECT}" \ diff --git a/run-clustering.sh b/run-clustering.sh index 7af7fbd..a149c02 100755 --- a/run-clustering.sh +++ 
b/run-clustering.sh @@ -84,6 +84,27 @@ do "${METAS_TYPE}" \ "${VAL_LST}" \ --outfile "${SUB_EXP_DIR}/val_count_matrix_type.pdf" - + + + # -- MEASURES AND PLOT WITH RESPECT TO LANG VAR + # Measures + python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ + "${METAS_LANG}" \ + "${TRAIN_LST}" \ + "${VAL_LST}" \ + --outfile "${SUB_EXP_DIR}/measures_lang.json" + + # This script plot the count matrix of the train set + python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ + "${METAS_LANG}" \ + "${TRAIN_LST}" \ + --outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf" + + # This script plot the count matrix of the validation set + python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ + "${METAS_LANG}" \ + "${VAL_LST}" \ + --outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf" + + done diff --git a/run-measures.sh b/run-measures.sh index b2dc722..a328ced 100755 --- a/run-measures.sh +++ b/run-measures.sh @@ -2,7 +2,8 @@ # quelques petites commandes que l'on souhaite # tester. 
-OUTDIR="exp/kmeans_teacher_1/pvector-1" +OUTDIR="exp/kmeans_euclidian/teacher-pvector-1" +EXP_DIR=${OUTDIR} DATADIR="data" NEW_LSTDIR="${OUTDIR}/lst" @@ -22,6 +23,7 @@ fi for kfold in {1..4} do pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt" + VECTOR_FILE=$pvector_file lst_dir="${DATADIR}/pvectors_1rst/lst" output_kfold="${OUTDIR}/${kfold}" @@ -38,41 +40,79 @@ do # --outfile "${NEW_LSTDIR}/val_${kfold}_type.lst" #cat "${NEW_LSTDIR}/train_${kfold}_type.lst" "${NEW_LSTDIR}/val_${kfold}_type.lst" > "${NEW_LSTDIR}/metas_${kfold}_type.lst" + TRAIN_LST=${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst + VAL_LST=${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst + TRAIN_LANG_LST=${NEW_LSTDIR}/train_${kfold}_lang.lst + VAL_LANG_LST=${NEW_LSTDIR}/val_${kfold}_lang.lst + METAS_LANG=${NEW_LSTDIR}/metas_${kfold}_lang.lst + + # EXTRACT LANGUAGE INFORMATION + awk '$2=$1' FS=, OFS=, "${TRAIN_LST}" > "${TRAIN_LANG_LST}" + echo "TRAIN EXTRACT LANGUAGE INFO DONE" + awk '$2=$1' FS=, OFS=, "${VAL_LST}" > "${VAL_LANG_LST}" + echo "VAL EXTRACT LANGUAGE INFO DONE" + cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}" + echo "GLOBAL EXTRACT LANGUAGE INFO DONE" + - echo "Clustering - ${kfold}" for k in $(seq ${kmin} 1 ${kmax}) do echo "Kmeans Measuring and ploting - ${k}" - - # This script compute measures from clustering - #python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${pvector_file}" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures.json" - + + SUB_EXP_DIR="${EXP_DIR}/${kfold}/${k}" + + # -- EXTRACT CLUSTERING LABELS + python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \ + "${VECTOR_FILE}" \ + --outfile "${SUB_EXP_DIR}/clustered_${k}.txt" + + # -- MEASURES AND PLOT WITH RESPECT TO LANG VAR + # Measures + python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ + "${METAS_LANG}" \ + "${TRAIN_LST}" \ + "${VAL_LST}" \ + --outfile 
"${SUB_EXP_DIR}/measures_lang.json" + + # This script plot the count matrix of the train set + python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ + "${METAS_LANG}" \ + "${TRAIN_LST}" \ + --outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf" + + # This script plot the count matrix of the validation set + python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ + "${METAS_LANG}" \ + "${VAL_LST}" \ + --outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf" + + rm -- "${SUB_EXP_DIR}/clustered_${k}.txt" #python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" \ # "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" \ # "${lst_dir}/val_${kfold}.lst" \ # --outfile "${output_kfold}/${k}/measures_type.json" # This script plot the count matrix of the train set - python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ - ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \ - --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf + #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ + # ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \ + # --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf # This script plot the count matrix of the validation set - python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ - ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \ - --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf + #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ + # ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \ + # --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf # This script plot the count matrix of the train set - python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ - ${pvector_file} ${lst_dir}/train_${kfold}.lst \ - --outfile ${output_kfold}/${k}/train_count_matrix.pdf + #python3 bin/plot-count-matrix.py 
${output_kfold}/${k}/clustered_${k}.txt \ + # ${pvector_file} ${lst_dir}/train_${kfold}.lst \ + # --outfile ${output_kfold}/${k}/train_count_matrix.pdf # This script plot the count matrix of the validation set - python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ - ${pvector_file} ${lst_dir}/val_${kfold}.lst \ - --outfile ${output_kfold}/${k}/val_count_matrix.pdf + #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ + # ${pvector_file} ${lst_dir}/val_${kfold}.lst \ + # --outfile ${output_kfold}/${k}/val_count_matrix.pdf done done diff --git a/run.sh b/run.sh index 4b9b39a..310f58b 100755 --- a/run.sh +++ b/run.sh @@ -44,13 +44,26 @@ then fi +# -- KFOLD MIN and MAX +if [ -z "$MIN_KFOLD" ] +then + MIN_KFOLD=1 +fi + +if [ -z "$MAX_KFOLD" ] +then + MAX_KFOLD=4 +fi + # -- BEGIN BY KFOLD -for kfold in {1..4} +for kfold in $(seq "${MIN_KFOLD}" "${MAX_KFOLD}") do # Some usefull variable CHAR_INFO="${DATADIR}/character_information.csv" TRAIN_TYPE_LST="${NEW_LSTDIR}/train_${kfold}_type.lst" VAL_TYPE_LST="${NEW_LSTDIR}/val_${kfold}_type.lst" + TRAIN_LANG_LST="${NEW_LSTDIR}/train_${kfold}_lang.lst" + VAL_LANG_LST="${NEW_LSTDIR}/val_${kfold}_lang.lst" # Configuration for the run clustering file if [ ${VECTOR_FILES_ONE} == false ] @@ -61,9 +74,9 @@ do TRAIN_LST="${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst" VAL_LST="${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst" EXP_DIR="${OUTDIR}/${kfold}" - METAS_TYPE="${NEW_LSTDIR}/metas_${kfold}_type.lst" #* + METAS_TYPE="${NEW_LSTDIR}/metas_${kfold}_type.lst" METAS_CHARACTER="${DATADIR}/masseffect.lst" - + METAS_LANG="${NEW_LSTDIR}/metas_${kfold}_lang.lst" if [ ! 
-d "${EXP_DIR}" ]; @@ -72,8 +85,9 @@ do fi - # Extract character information + # EXTRACT TYPE INFORMATION echo "Extracting character information" + echo "Replace in train" python3 "bin/replace_label.py" \ "${METAS_CHARACTER}" \ "${CHAR_INFO}" \ @@ -81,6 +95,7 @@ do --lst "${TRAIN_LST}" \ --outfile "${TRAIN_TYPE_LST}" + echo "Replace in val" python3 "bin/replace_label.py" \ "${METAS_CHARACTER}" \ "${CHAR_INFO}" \ @@ -88,8 +103,19 @@ do --lst "${VAL_LST}" \ --outfile "${VAL_TYPE_LST}" + echo "Merge them" cat "${TRAIN_TYPE_LST}" "${VAL_TYPE_LST}" > "${METAS_TYPE}" - + + # EXTRACT LANGUAGE INFORMATION + echo "Language info for train" + awk '$2=$1' FS=, OFS=, "${TRAIN_LST}" > "${TRAIN_LANG_LST}" + echo "Language info for val" + awk '$2=$1' FS=, OFS=, "${VAL_LST}" > "${VAL_LANG_LST}" + + echo "Merge them" + cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}" + + echo "Then Run Clustering" source "run-clustering.sh" done