diff --git a/README.md b/README.md index 2044ac3..4bf6774 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,11 @@ # Clustering A repository where i put everything dealing with clustering algorithms. +# How to use +You can run directly the run.sh script if you want. You just need data. + +You can use some scripts in utils tool, but run these scripts from the root directory "clustering/". + # TODO - Organiser les différentes listes de données pour mes expériences - Create a data file example diff --git a/bin/regroup-measures.py b/bin/regroup-measures.py index 863932d..bad2306 100644 --- a/bin/regroup-measures.py +++ b/bin/regroup-measures.py @@ -40,6 +40,8 @@ def save_results(outfile, measures, titles): # -- PARSER parser = argparse.ArgumentParser(description="") parser.add_argument("expdir", type=str, help="Directory of experiment") +parser.add_argument("--nkfold", type=int, default=4, help="number of kfold") +parser.add_argument("--nkfoldmin", type=int, default=1, help="Begin with this numero of kfold") parser.add_argument("--measurefile", type=str, default="measures.json", help="Measure file it searchs in folders") parser.add_argument("--suffix", type=str, default="", @@ -49,6 +51,8 @@ args = parser.parse_args() EXP_DIR = args.expdir MEASURE_FILE = args.measurefile SUFFIX = args.suffix +MAX_KFOLD = args.nkfold +MIN_KFOLD = args.nkfoldmin # EXP_DIR="exp/kmeans_teacher_1/pvector-1" RESULTS_DIR = os.path.join(EXP_DIR, "res") @@ -83,7 +87,7 @@ def init_measures(): measures = init_measures() -for kfold in range(1, 5): +for kfold in range(MIN_KFOLD, MAX_KFOLD + 1): print("Regrouping on kfold: " + str(kfold)) # -- REGROUP MEASURES INTO LISTS for k in range(kmin, kmax+1): diff --git a/bin/replace-features.py b/bin/replace-features.py new file mode 100644 index 0000000..693ae97 --- /dev/null +++ b/bin/replace-features.py @@ -0,0 +1,28 @@ + +import argparse + +from data import read_file, index_by_id, write_line + +# -- ARGPARSE +parser = argparse.ArgumentParser( + description="Replace features with file from to file to") +parser.add_argument("fromfile", type=str, help="From list or features file") +parser.add_argument("tofile", type=str, help="Features of 'from' saved into this file.") + +args = parser.parse_args() +FROM = args.fromfile +TO = args.tofile + + +# -- READ AND INDEX FILES +from_data = read_file(FROM) +from_by_id = index_by_id(from_data) + +to_data = read_file(TO) + +with open(TO, "w") as f: + for line in to_data: + metas = line[0] + features = from_by_id[metas[0]][metas[3]][1] + write_line(metas, features, f) + diff --git a/config/archives/ivector_config.sh b/config/archives/ivector_config.sh new file mode 100644 index 0000000..883091a --- /dev/null +++ b/config/archives/ivector_config.sh @@ -0,0 +1,9 @@ +OUTDIR="exp/kmeans_euclidian/ivectors" +DATADIR="data" +NEW_LSTDIR="${OUTDIR}/lst" + +VECTOR_FILE="data/ivectors.txt" # To specify if there's only one +VECTOR_FILES_ONE=true # Specify there's only one file + +KMIN=2 +KMAX=100 diff --git a/config/archives/pv_from_xv_config.sh b/config/archives/pv_from_xv_config.sh new file mode 100644 index 0000000..fda429e --- /dev/null +++ b/config/archives/pv_from_xv_config.sh @@ -0,0 +1,13 @@ + +# Framework configuration +OUTDIR="exp/kmeans_euclidian/pv_from_xv" +DATADIR="data" +NEW_LSTDIR="${OUTDIR}/lst" + +VECTOR_FILES_BEGIN="${DATADIR}/pv_from_xv/me_pv_teacher" +VECTOR_FILES_END=".txt" +VECTOR_FILE="" # To specify if there's only one +VECTOR_FILES_ONE=false # Specify there's only one file + +KMIN=2 +KMAX=100 diff --git a/config/archives/pvector_config.sh b/config/archives/pvector_config.sh new file mode 100644 index 0000000..e75dae1 --- /dev/null +++ b/config/archives/pvector_config.sh @@ -0,0 +1,12 @@ + +OUTDIR="exp/kmeans_euclidian/teacher-pvector-1" +DATADIR="data" +NEW_LSTDIR="${OUTDIR}/lst" + +VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher" +VECTOR_FILES_END=".txt" +VECTOR_FILE="" # To specify if there's only one +VECTOR_FILES_ONE=false # Specify there's only one file + +KMIN=2 +KMAX=100 diff --git a/config/archives/pvector_layer1_config.sh b/config/archives/pvector_layer1_config.sh new file mode 100644 index 0000000..c60e2af --- /dev/null +++ b/config/archives/pvector_layer1_config.sh @@ -0,0 +1,11 @@ +OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer1" +DATADIR="data" +NEW_LSTDIR="${OUTDIR}/lst" + +VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_1" +VECTOR_FILES_END=".txt" +VECTOR_FILE="" # To specify if there's only one +VECTOR_FILES_ONE=false # Specify there's only one file + +KMIN=2 +KMAX=100 diff --git a/config/archives/pvector_layer2_config.sh b/config/archives/pvector_layer2_config.sh new file mode 100644 index 0000000..79e5d12 --- /dev/null +++ b/config/archives/pvector_layer2_config.sh @@ -0,0 +1,11 @@ +OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer2" +DATADIR="data" +NEW_LSTDIR="${OUTDIR}/lst" + +VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_2" +VECTOR_FILES_END=".txt" +VECTOR_FILE="" # To specify if there's only one +VECTOR_FILES_ONE=false # Specify there's only one file + +KMIN=2 +KMAX=100 diff --git a/config/archives/pvector_layer3_config.sh b/config/archives/pvector_layer3_config.sh new file mode 100644 index 0000000..dc7ec52 --- /dev/null +++ b/config/archives/pvector_layer3_config.sh @@ -0,0 +1,11 @@ +OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer3" +DATADIR="data" +NEW_LSTDIR="${OUTDIR}/lst" + +VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_3" +VECTOR_FILES_END=".txt" +VECTOR_FILE="" # To specify if there's only one +VECTOR_FILES_ONE=false # Specify there's only one file + +KMIN=2 +KMAX=100 diff --git a/config/archives/pvector_layer4_config.sh b/config/archives/pvector_layer4_config.sh new file mode 100644 index 0000000..33e03d1 --- /dev/null +++ b/config/archives/pvector_layer4_config.sh @@ -0,0 +1,11 @@ +OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer4" +DATADIR="data" +NEW_LSTDIR="${OUTDIR}/lst" + +VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_4" +VECTOR_FILES_END=".txt" +VECTOR_FILE="" # To specify if there's only one +VECTOR_FILES_ONE=false # Specify there's only one file + +KMIN=2 +KMAX=100 diff --git a/config/archives/xvector_config.sh b/config/archives/xvector_config.sh new file mode 100644 index 0000000..73a47fd --- /dev/null +++ b/config/archives/xvector_config.sh @@ -0,0 +1,9 @@ +OUTDIR="exp/kmeans_euclidian/xvectors" +DATADIR="data" +NEW_LSTDIR="${OUTDIR}/lst" + +VECTOR_FILE="data/xvectors.txt" # To specify if there's only one +VECTOR_FILES_ONE=true # Specify there's only one file + +KMIN=2 +KMAX=100 diff --git a/config/config_iv.sh b/config/config_iv.sh new file mode 100644 index 0000000..9a7a7dd --- /dev/null +++ b/config/config_iv.sh @@ -0,0 +1,15 @@ +OUTDIR="exp/kmeans_euclidian/iv" +DATADIR="data" +NEW_LSTDIR="${OUTDIR}/lst" + +VECTOR_FILE="data/ivectors.txt" # To specify if there's only one +VECTOR_FILES_ONE=true # Specify there's only one file + +METAS_CHARACTER="data/masseffect.lst" +CHAR_INFO="data/masseffect_character_information.csv" + +ORIGINAL_VECTOR_FILE="${VECTOR_FILE}" + +KMIN=2 +KMAX=100 + diff --git a/config/config_iv_skyrim.sh b/config/config_iv_skyrim.sh new file mode 100644 index 0000000..7994a56 --- /dev/null +++ b/config/config_iv_skyrim.sh @@ -0,0 +1,15 @@ +OUTDIR="exp/kmeans_euclidian_skyrim/iv" +DATADIR="data" +NEW_LSTDIR="${OUTDIR}/lst" + +VECTOR_FILE="../data/skyrim/skyrim_ivectors.txt" # To specify if there's only one +VECTOR_FILES_ONE=true # Specify there's only one file + +METAS_CHARACTER="../data/skyrim/skyrim.lst" +CHAR_INFO="data/skyrim_character_information.csv" + +ORIGINAL_VECTOR_FILE="${VECTOR_FILE}" + +KMIN=2 +KMAX=100 + diff --git a/config/config_pv_from_iv.sh b/config/config_pv_from_iv.sh new file mode 100644 index 0000000..ef3cb20 --- /dev/null +++ b/config/config_pv_from_iv.sh @@ -0,0 +1,27 @@ + +if [ -z "$kfold" ] +then + kfold=1 +fi + +if [ -z "${t}" ] +then + t=2.0 +fi + +OUTDIR="exp/kmeans_euclidian/pv_from_iv/${kfold}" +DATADIR="data" +MOTHER_LST_DIR="/local_disk/pegasus/laboinfo/mquillot/vocal_similarity_system/data/prot_alpha" +NEW_LSTDIR="${OUTDIR}/lst" + + +VECTOR_FILE="/local_disk/pegasus/laboinfo/mquillot/knowledge_distillation/exp/kd_iv/${kfold}/${t}/teacher/masseffect_pvectors.txt" # To specify if there's only one +VECTOR_FILES_ONE=true # Specify there's only one file +ORIGINAL_VECTOR_FILE="/local_disk/pegasus/laboinfo/mquillot/knowledge_distillation/data/masseffect.txt" + + +MIN_KFOLD=${kfold} +MAX_KFOLD=${kfold} + +KMIN=2 +KMAX=100 diff --git a/config/config_pv_from_xv.sh b/config/config_pv_from_xv.sh new file mode 100644 index 0000000..aff2cc1 --- /dev/null +++ b/config/config_pv_from_xv.sh @@ -0,0 +1,26 @@ + +if [ -z "$kfold" ] +then + kfold=1 +fi + +if [ -z "${t}" ] +then + t=2.0 +fi + +OUTDIR="exp/kmeans_euclidian/pv_from_xv/${kfold}" +DATADIR="data" +MOTHER_LST_DIR="/local_disk/pegasus/laboinfo/mquillot/vocal_similarity_system/data/prot_alpha" +NEW_LSTDIR="${OUTDIR}/lst" + + +VECTOR_FILE="/local_disk/pegasus/laboinfo/mquillot/knowledge_distillation/exp/kd_xvectors/${kfold}/${t}/teacher/masseffect_pvectors.txt" # To specify if there's only one +VECTOR_FILES_ONE=true # Specify there's only one file +ORIGINAL_VECTOR_FILE="/local_disk/pegasus/laboinfo/mquillot/knowledge_distillation/data/masseffect_xvectors.txt" + +MIN_KFOLD=${kfold} +MAX_KFOLD=${kfold} + +KMIN=2 +KMAX=100 diff --git a/config/config_without_kfold_iv.sh b/config/config_without_kfold_iv.sh new file mode 100644 index 0000000..340bb18 --- /dev/null +++ b/config/config_without_kfold_iv.sh @@ -0,0 +1,13 @@ +OUTDIR="exp/kmeans_euclidian_skyrim/ivectors" +DATADIR="data" +NEW_LSTDIR="${OUTDIR}/lst" + +LST_FILE="/local_disk/pegasus/laboinfo/mquillot/data/skyrim/skyrim_ivectors.txt" +VECTOR_FILE="data/ivectors.txt" # To specify if there's only one +VECTOR_FILES_ONE=true # Specify there's only one file + +WITHOUT_KFOLD="" +KMIN=2 +KMAX=100 + +METAS_CHARACTER="" \ No newline at end of file diff --git a/config/config_xv.sh b/config/config_xv.sh new file mode 100644 index 0000000..fc47378 --- /dev/null +++ b/config/config_xv.sh @@ -0,0 +1,10 @@ +OUTDIR="exp/kmeans_euclidian/xv" +DATADIR="data" +NEW_LSTDIR="${OUTDIR}/lst" + +VECTOR_FILE="data/xvectors.txt" # To specify if there's only one +VECTOR_FILES_ONE=true # Specify there's only one file + +ORIGINAL_VECTOR_FILE="${VECTOR_FILE}" +KMIN=2 +KMAX=100 diff --git a/config/ivector_config.sh b/config/ivector_config.sh deleted file mode 100644 index 883091a..0000000 --- a/config/ivector_config.sh +++ /dev/null @@ -1,9 +0,0 @@ -OUTDIR="exp/kmeans_euclidian/ivectors" -DATADIR="data" -NEW_LSTDIR="${OUTDIR}/lst" - -VECTOR_FILE="data/ivectors.txt" # To specify if there's only one -VECTOR_FILES_ONE=true # Specify there's only one file - -KMIN=2 -KMAX=100 diff --git a/config/pv_from_xv_config.sh b/config/pv_from_xv_config.sh deleted file mode 100644 index fda429e..0000000 --- a/config/pv_from_xv_config.sh +++ /dev/null @@ -1,13 +0,0 @@ - -# Framework configuration -OUTDIR="exp/kmeans_euclidian/pv_from_xv" -DATADIR="data" -NEW_LSTDIR="${OUTDIR}/lst" - -VECTOR_FILES_BEGIN="${DATADIR}/pv_from_xv/me_pv_teacher" -VECTOR_FILES_END=".txt" -VECTOR_FILE="" # To specify if there's only one -VECTOR_FILES_ONE=false # Specify there's only one file - -KMIN=2 -KMAX=100 diff --git a/config/pvector_config.sh b/config/pvector_config.sh deleted file mode 100644 index e75dae1..0000000 --- a/config/pvector_config.sh +++ /dev/null @@ -1,12 +0,0 @@ - -OUTDIR="exp/kmeans_euclidian/teacher-pvector-1" -DATADIR="data" -NEW_LSTDIR="${OUTDIR}/lst" - -VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher" -VECTOR_FILES_END=".txt" -VECTOR_FILE="" # To specify if there's only one -VECTOR_FILES_ONE=false # Specify there's only one file - -KMIN=2 -KMAX=100 diff --git a/config/pvector_layer1_config.sh b/config/pvector_layer1_config.sh deleted file mode 100644 index c60e2af..0000000 --- a/config/pvector_layer1_config.sh +++ /dev/null @@ -1,11 +0,0 @@ -OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer1" -DATADIR="data" -NEW_LSTDIR="${OUTDIR}/lst" - -VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_1" -VECTOR_FILES_END=".txt" -VECTOR_FILE="" # To specify if there's only one -VECTOR_FILES_ONE=false # Specify there's only one file - -KMIN=2 -KMAX=100 diff --git a/config/pvector_layer2_config.sh b/config/pvector_layer2_config.sh deleted file mode 100644 index 79e5d12..0000000 --- a/config/pvector_layer2_config.sh +++ /dev/null @@ -1,11 +0,0 @@ -OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer2" -DATADIR="data" -NEW_LSTDIR="${OUTDIR}/lst" - -VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_2" -VECTOR_FILES_END=".txt" -VECTOR_FILE="" # To specify if there's only one -VECTOR_FILES_ONE=false # Specify there's only one file - -KMIN=2 -KMAX=100 diff --git a/config/pvector_layer3_config.sh b/config/pvector_layer3_config.sh deleted file mode 100644 index dc7ec52..0000000 --- a/config/pvector_layer3_config.sh +++ /dev/null @@ -1,11 +0,0 @@ -OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer3" -DATADIR="data" -NEW_LSTDIR="${OUTDIR}/lst" - -VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_3" -VECTOR_FILES_END=".txt" -VECTOR_FILE="" # To specify if there's only one -VECTOR_FILES_ONE=false # Specify there's only one file - -KMIN=2 -KMAX=100 diff --git a/config/pvector_layer4_config.sh b/config/pvector_layer4_config.sh deleted file mode 100644 index 33e03d1..0000000 --- a/config/pvector_layer4_config.sh +++ /dev/null @@ -1,11 +0,0 @@ -OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer4" -DATADIR="data" -NEW_LSTDIR="${OUTDIR}/lst" - -VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_4" -VECTOR_FILES_END=".txt" -VECTOR_FILE="" # To specify if there's only one -VECTOR_FILES_ONE=false # Specify there's only one file - -KMIN=2 -KMAX=100 diff --git a/config/xvector_config.sh b/config/xvector_config.sh deleted file mode 100644 index 73a47fd..0000000 --- a/config/xvector_config.sh +++ /dev/null @@ -1,9 +0,0 @@ -OUTDIR="exp/kmeans_euclidian/xvectors" -DATADIR="data" -NEW_LSTDIR="${OUTDIR}/lst" - -VECTOR_FILE="data/xvectors.txt" # To specify if there's only one -VECTOR_FILES_ONE=true # Specify there's only one file - -KMIN=2 -KMAX=100 diff --git a/extract-labels-pv-from-xv.sh b/extract-labels-pv-from-xv.sh deleted file mode 100755 index 6565075..0000000 --- a/extract-labels-pv-from-xv.sh +++ /dev/null @@ -1,27 +0,0 @@ - - -# Number of set -k=4 - - -# Vector features file -DATADIR="data" - -VECTOR_FILE_MASSEFFECT="${DATADIR}/xvectors.txt" - -for kmean in 12 41 45 50 6 69 72 88 -do - echo "KMEAN: ${kmean}" - # Dirs - EXP_DIR="exp/kmeans_euclidian/pv_from_xv/${k}/${kmean}" - CLUSTERING="${EXP_DIR}/clustering_${kmean}.pkl" - - - # Output dirs - OUTFILE_MASSEFFECT="data/pv_from_xv/saved_clustered/masseffect_clustered_${k}_${kmean}.txt" - echo "Extracting" - python3 bin/extract_kmeans.py "${CLUSTERING}" \ - "${VECTOR_FILE_MASSEFFECT}" \ - --outfile "$OUTFILE_MASSEFFECT" - echo "End extracting" -done diff --git a/extract-labels.sh b/extract-labels.sh deleted file mode 100755 index fa51890..0000000 --- a/extract-labels.sh +++ /dev/null @@ -1,22 +0,0 @@ - - -# Number of set -k=4 -kmean=88 - - -# Vector features file -VECTOR_FILE_MASSEFFECT="data/xvectors.txt" - - -# Dirs -EXP_DIR="exp/kmeans_euclidian/xvectors/${k}/${kmean}" -CLUSTERING="${EXP_DIR}/clustering_${kmean}.pkl" - - -# Output dirs -OUTFILE_MASSEFFECT="data/xvectors/saved_clustered/masseffect_clustered_xvectors_${k}_${kmean}.txt" - -python3 bin/extract_kmeans.py "${CLUSTERING}" \ - "${VECTOR_FILE_MASSEFFECT}" \ - --outfile "$OUTFILE_MASSEFFECT" diff --git a/rm-unused-files.sh b/rm-unused-files.sh deleted file mode 100755 index 86fe846..0000000 --- a/rm-unused-files.sh +++ /dev/null @@ -1,16 +0,0 @@ - -if [ $# -eq 1 ] -then - EXP_DIR="$1" -else - echo "Need to have one and only one argument. This argument is the exp directory." - exit 1 -fi - -for kfold in {1..4} -do - for k in {1..100} - do - rm ${EXP_DIR}/$kfold/$k/clustered_$k.txt - done -done diff --git a/run-clustering.sh b/run-clustering.sh index a149c02..fb089b1 100755 --- a/run-clustering.sh +++ b/run-clustering.sh @@ -29,7 +29,7 @@ # -- TRAIN KMEANS -echo "Clustering - ${kfold}" +echo "Clustering - ${kfold}"sss python3 bin/cluster_kmeans.py "${VECTOR_FILE}" \ "${TRAIN_LST}" \ "${EXP_DIR}" --kmin ${KMIN} --kmax ${KMAX} diff --git a/run-measures.sh b/run-measures.sh index a328ced..42a38ce 100755 --- a/run-measures.sh +++ b/run-measures.sh @@ -2,7 +2,9 @@ # quelques petites commandes que l'on souhaite # tester. -OUTDIR="exp/kmeans_euclidian/teacher-pvector-1" +set -e + +OUTDIR="exp/kmeans_euclidian/ivectors" EXP_DIR=${OUTDIR} DATADIR="data" NEW_LSTDIR="${OUTDIR}/lst" @@ -22,8 +24,8 @@ fi for kfold in {1..4} do - pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt" - VECTOR_FILE=$pvector_file + #pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt" + VECTOR_FILE="${DATADIR}/ivectors.txt" lst_dir="${DATADIR}/pvectors_1rst/lst" output_kfold="${OUTDIR}/${kfold}" @@ -61,58 +63,19 @@ do do echo "Kmeans Measuring and ploting - ${k}" - SUB_EXP_DIR="${EXP_DIR}/${kfold}/${k}" - - # -- EXTRACT CLUSTERING LABELS - python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \ - "${VECTOR_FILE}" \ - --outfile "${SUB_EXP_DIR}/clustered_${k}.txt" - - # -- MEASURES AND PLOT WITH RESPECT TO LANG VAR - # Measures - python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ - "${METAS_LANG}" \ - "${TRAIN_LST}" \ - "${VAL_LST}" \ - --outfile "${SUB_EXP_DIR}/measures_lang.json" + SUB_EXP_DIR="${EXP_DIR}/${kfold}/${k}" - # This script plot the count matrix of the train set - python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ - "${METAS_LANG}" \ - "${TRAIN_LST}" \ - --outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf" + # -- EXTRACT CLUSTERING LABELS + python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \ + "${VECTOR_FILE}" \ + --outfile "${SUB_EXP_DIR}/clustered_${k}.txt" - # This script plot the count matrix of the validation set - python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ - "${METAS_LANG}" \ - "${VAL_LST}" \ - --outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf" + # -- MEASURES AND PLOT + source steps/measure_clustering_char.sh + source steps/measure_clustering_type.sh + source steps/measure_clustering_lang.sh - rm ${SUB_EXP_DIR}/clustered_${k}.txt - #python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" \ - # "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" \ - # "${lst_dir}/val_${kfold}.lst" \ - # --outfile "${output_kfold}/${k}/measures_type.json" - - # This script plot the count matrix of the train set - #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ - # ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \ - # --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf - - # This script plot the count matrix of the validation set - #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ - # ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \ - # --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf - - # This script plot the count matrix of the train set - #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ - # ${pvector_file} ${lst_dir}/train_${kfold}.lst \ - # --outfile ${output_kfold}/${k}/train_count_matrix.pdf - - # This script plot the count matrix of the validation set - #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ - # ${pvector_file} ${lst_dir}/val_${kfold}.lst \ - # --outfile ${output_kfold}/${k}/val_count_matrix.pdf + rm ${SUB_EXP_DIR}/clustered_${k}.txt done done diff --git a/run-skyrim.sh b/run-skyrim.sh new file mode 100644 index 0000000..e25f280 --- /dev/null +++ b/run-skyrim.sh @@ -0,0 +1 @@ +python bin/cluster_kmeans.py ../data/skyrim/skyrim_ivectors.txt ../data/skyrim/skyrim.lst exp/kmeans_euclidian_skyrim/ivectors/ --kmin 1 --kmax 100 diff --git a/run.sh b/run.sh index 310f58b..f734200 100755 --- a/run.sh +++ b/run.sh @@ -31,6 +31,16 @@ then fi +if [ -z "$METAS_CHARACTER" ] +then + METAS_CHARACTER="${DATADIR}/masseffect.lst" +fi + + +if [ -z "$CHAR_INFO" ] +then + CHAR_INFO="${DATADIR}/character_information.csv" +fi # -- MAKE DIRECTORIES if [ ! -d "$OUTDIR" ]; @@ -59,7 +69,6 @@ fi for kfold in $(seq ${MIN_KFOLD} ${MAX_KFOLD}) do # Some usefull variable - CHAR_INFO="${DATADIR}/character_information.csv" TRAIN_TYPE_LST="${NEW_LSTDIR}/train_${kfold}_type.lst" VAL_TYPE_LST="${NEW_LSTDIR}/val_${kfold}_type.lst" TRAIN_LANG_LST="${NEW_LSTDIR}/train_${kfold}_lang.lst" @@ -71,11 +80,10 @@ do VECTOR_FILE="${VECTOR_FILES_BEGIN}_${kfold}${VECTOR_FILES_END}" fi - TRAIN_LST="${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst" - VAL_LST="${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst" + TRAIN_LST="${MOTHER_LST_DIR}/lst/train_${kfold}.lst" + VAL_LST="${MOTHER_LST_DIR}/lst/val_${kfold}.lst" EXP_DIR="${OUTDIR}/${kfold}" METAS_TYPE="${NEW_LSTDIR}/metas_${kfold}_type.lst" - METAS_CHARACTER="${DATADIR}/masseffect.lst" METAS_LANG="${NEW_LSTDIR}/metas_${kfold}_lang.lst" diff --git a/run_kfold.sh b/run_kfold.sh new file mode 100755 index 0000000..f436bfe --- /dev/null +++ b/run_kfold.sh @@ -0,0 +1,7 @@ + +for kfold in `seq 1 4` +do + echo "KFOLD: ${kfold}" + source run.sh +done + diff --git a/run_without_kfold.sh b/run_without_kfold.sh new file mode 100644 index 0000000..cdfced9 --- /dev/null +++ b/run_without_kfold.sh @@ -0,0 +1,16 @@ + +for k in $(seq ${KMIN} 1 ${KMAX}) +do + SUB_EXP_DIR="${EXP_DIR}/${k}" + + # -- EXTRACT KMEANS VALUES + echo "Kmeans Measuring and extraction - ${k}" + python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \ + "${VECTOR_FILE}" \ + --outfile "${SUB_EXP_DIR}/clustered_${k}.txt" + + python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ + "${METAS_CHARACTER}" \ + "${TRAIN_LST}" \ + "${VAL_LST}" \ + --outfile "${SUB_EXP_DIR}/measures.json" \ No newline at end of file diff --git a/steps/extract_cluster_file.sh b/steps/extract_cluster_file.sh new file mode 100755 index 0000000..5a41a31 --- /dev/null +++ b/steps/extract_cluster_file.sh @@ -0,0 +1,24 @@ + +for kfold in `seq 1 4` +do + source $1 + vector_file=${VECTOR_FILE} + echo "kfold: $kfold" + for kmean in `seq 2 100` + do + echo "kmean: $kmean" + exp_dir="${OUTDIR}/${kfold}/${kmean}" + clustering="${exp_dir}/clustering_${kmean}.pkl" + save_loc="${exp_dir}" + saved_txt="${save_loc}/masseffect_clustered.txt" + saved_lst="${save_loc}/masseffect_clustered.lst" + + python3 bin/extract_kmeans.py "${clustering}" \ + "${vector_file}" \ + --outfile "${saved_txt}" + + cat ${saved_txt} | cut -d" " -f1 > ${saved_lst} + + python3 bin/replace-features.py "${ORIGINAL_VECTOR_FILE}" "${saved_txt}" + done +done diff --git a/steps/extract_cluster_file_skyrim.sh b/steps/extract_cluster_file_skyrim.sh new file mode 100755 index 0000000..5894fb1 --- /dev/null +++ b/steps/extract_cluster_file_skyrim.sh @@ -0,0 +1,22 @@ + +source $1 +vector_file=${VECTOR_FILE} +echo "kfold: $kfold" +for kmean in `seq 2 100` +do + echo "kmean: $kmean" + exp_dir="${OUTDIR}/${kmean}" + clustering="${exp_dir}/clustering_${kmean}.pkl" + save_loc="${exp_dir}" + saved_txt="${save_loc}/masseffect_clustered.txt" + saved_lst="${save_loc}/masseffect_clustered.lst" + + python3 bin/extract_kmeans.py "${clustering}" \ + "${vector_file}" \ + --outfile "${saved_txt}" + + cat ${saved_txt} | cut -d" " -f1 > ${saved_lst} + + python3 bin/replace-features.py "${ORIGINAL_VECTOR_FILE}" "${saved_txt}" +done + diff --git a/steps/extract_language_lst.sh b/steps/extract_language_lst.sh new file mode 100755 index 0000000..80f2219 --- /dev/null +++ b/steps/extract_language_lst.sh @@ -0,0 +1,17 @@ +DATADIR="data" +OUTDIR="exp/kmeans_euclidian/ivectors" +NEW_LSTDIR="${OUTDIR}/lst" + +TRAIN_LST=${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst +VAL_LST=${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst +TRAIN_LANG_LST=${NEW_LSTDIR}/train_${kfold}_lang.lst +VAL_LANG_LST=${NEW_LSTDIR}/val_${kfold}_lang.lst +METAS_LANG=${NEW_LSTDIR}/metas_${kfold}_lang.lst + + +awk '$2=$1' FS=, OFS=, ${TRAIN_LST} > ${TRAIN_LANG_LST} +echo "VAL EXTRACT LANGUAGE INFO DONE" +awk '$2=$1' FS=, OFS=, ${VAL_LST} > ${VAL_LANG_LST} +echo "TRAIN EXTRACT LANGUAGE INFO DONE" +cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}" +echo "GLOBAL EXTRACT LANGUAGE INFO DONE" \ No newline at end of file diff --git a/steps/measure_clustering_char.sh b/steps/measure_clustering_char.sh new file mode 100644 index 0000000..cfa79fb --- /dev/null +++ b/steps/measure_clustering_char.sh @@ -0,0 +1,18 @@ + +python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" \ + "${lst_dir}/trainval_${kfold}.lst" "${lst_dir}/train_${kfold}.lst" \ + "${lst_dir}/val_${kfold}.lst" \ + --outfile "${output_kfold}/${k}/measures.json" + + +# This script plot the count matrix of the train set +python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ + "${lst_dir}/train_${kfold}.lst" \ + "${lst_dir}/train_${kfold}.lst" \ + --outfile "${SUB_EXP_DIR}/train_count_matrix.pdf" + +# This script plot the count matrix of the validation set +python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ + "${lst_dir}/val_${kfold}.lst" \ + "${lst_dir}/val_${kfold}.lst" \ + --outfile "${SUB_EXP_DIR}/val_count_matrix.pdf" diff --git a/steps/measure_clustering_lang.sh b/steps/measure_clustering_lang.sh new file mode 100755 index 0000000..04e3eab --- /dev/null +++ b/steps/measure_clustering_lang.sh @@ -0,0 +1,18 @@ + +python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ + "${METAS_LANG}" \ + "${TRAIN_LST}" \ + "${VAL_LST}" \ + --outfile "${SUB_EXP_DIR}/measures_lang.json" + +# This script plot the count matrix of the train set +python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ + "${METAS_LANG}" \ + "${TRAIN_LST}" \ + --outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf" + +# This script plot the count matrix of the validation set +python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ + "${METAS_LANG}" \ + "${VAL_LST}" \ + --outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf" \ No newline at end of file diff --git a/steps/measure_clustering_type.sh b/steps/measure_clustering_type.sh new file mode 100644 index 0000000..e649b07 --- /dev/null +++ b/steps/measure_clustering_type.sh @@ -0,0 +1,15 @@ +python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" \ + "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" \ + "${lst_dir}/val_${kfold}.lst" \ + --outfile "${output_kfold}/${k}/measures_type.json" + +# This script plot the count matrix of the train set +python3 bin/plot-count-matrix.py "${output_kfold}/${k}/clustered_${k}.txt" \ + "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" \ + --outfile "${output_kfold}/${k}/train_count_matrix_type.pdf" + +# This script plot the count matrix of the validation set +python3 bin/plot-count-matrix.py "${output_kfold}/${k}/clustered_${k}.txt" \ + "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/val_${kfold}.lst" \ + --outfile "${output_kfold}/${k}/val_count_matrix_type.pdf" + \ No newline at end of file diff --git a/steps/save_clusters_file.sh b/steps/save_clusters_file.sh new file mode 100755 index 0000000..ef82360 --- /dev/null +++ b/steps/save_clusters_file.sh @@ -0,0 +1,22 @@ + +vector_file="data/xvectors.txt" + +for kfold in `seq 1 4` +do + echo "kfold: $kfold" + for kmean in `seq 2 100` + do + echo "kmean: $kmean" + exp_dir="exp/kmeans_euclidian/xvectors/${kfold}/${kmean}" + clustering="${exp_dir}/clustering_${kmean}.pkl" + save_loc="data/xvectors/saved_clustered/" + saved_txt="${save_loc}/masseffect_clustered_xvectors_${kfold}_${kmean}.txt" + saved_lst="${save_loc}/masseffect_clustered_xvectors_${kfold}_${kmean}.lst" + + python3 bin/extract_kmeans.py "${clustering}" \ + "${vector_file}" \ + --outfile "${saved_txt}" + + cat ${saved_txt} | cut -d" " -f1 > ${saved_lst} + done +done diff --git a/utils/extract-labels.sh b/utils/extract-labels.sh new file mode 100755 index 0000000..fa51890 --- /dev/null +++ b/utils/extract-labels.sh @@ -0,0 +1,22 @@ + + +# Number of set +k=4 +kmean=88 + + +# Vector features file +VECTOR_FILE_MASSEFFECT="data/xvectors.txt" + + +# Dirs +EXP_DIR="exp/kmeans_euclidian/xvectors/${k}/${kmean}" +CLUSTERING="${EXP_DIR}/clustering_${kmean}.pkl" + + +# Output dirs +OUTFILE_MASSEFFECT="data/xvectors/saved_clustered/masseffect_clustered_xvectors_${k}_${kmean}.txt" + +python3 bin/extract_kmeans.py "${CLUSTERING}" \ + "${VECTOR_FILE_MASSEFFECT}" \ + --outfile "$OUTFILE_MASSEFFECT" diff --git a/utils/rm-unused-files.sh b/utils/rm-unused-files.sh new file mode 100755 index 0000000..86fe846 --- /dev/null +++ b/utils/rm-unused-files.sh @@ -0,0 +1,16 @@ + +if [ $# -eq 1 ] +then + EXP_DIR="$1" +else + echo "Need to have one and only one argument. This argument is the exp directory." + exit 1 +fi + +for kfold in {1..4} +do + for k in {1..100} + do + rm ${EXP_DIR}/$kfold/$k/clustered_$k.txt + done +done diff --git a/utils/transform_exp_to_kd.sh b/utils/transform_exp_to_kd.sh new file mode 100755 index 0000000..9decb9b --- /dev/null +++ b/utils/transform_exp_to_kd.sh @@ -0,0 +1,87 @@ + +# -- DESCRIPTION -- +# +# This script aims to transform data in a shape that is +# usable mainly by knowledge distillation scripts. +# +# Firstly, it extracts clustering labels +# then change features with the given one +# and finally generate a list file. +# +# The pair features files and list file will be usable +# by the knowledge distillation system. +# -------------------- + + +# -- CONFIGURATION -- +# Configuration error +set -e + +# KFOLD config +MIN_KFOLD=1 +MAX_KFOLD=4 + +# KMEAN config +MIN_KMEAN=2 +MAX_KMEAN=100 + +# Vector features file +DATADIR="data" +FEATURES_DIR="${DATADIR}/pv_from_xv" +FEATURES_PREFIX="me_pv_teacher" +FEATURES_SUFFIX=".txt" + +EXP_DIR="exp/kmeans_euclidian/pv_from_xv" +VECTOR_FILE_MASSEFFECT="${DATADIR}/xvectors.txt" +OUTDIR="data/pv_from_xv/saved_clustered" + +# -- CREATE DIRECTORIES +# OUTPUT DIRECTORY +if [ ! -d "${OUTDIR}" ] +then + mkdir -p ${OUTDIR} +fi + + +# -- FUNCTIONS -- +# Definition of the transform function +function transform() { + # Define subdir variable + local SUB_EXP_DIR="${EXP_DIR}/${k}/${kmean}" + + # Define features file variable + local INITIAL_VECTOR_FILE="${FEATURES_DIR}/${FEATURES_PREFIX}_${k}${FEATURES_SUFFIX}" + + # Information of the current process + echo "[KFOLD, KMEAN]: [${k}, ${kmean}]" + + # Define clustering model variable + local CLUSTERING="${SUB_EXP_DIR}/clustering_${kmean}.pkl" + + + # Define output file + local OUTFILE_MASSEFFECT="${OUTDIR}/masseffect_clustered_${k}_${kmean}.txt" + + # Extracting clustering labels + echo "Extracting clustering labels" + python3 bin/extract_kmeans.py "${CLUSTERING}" \ + "${INITIAL_VECTOR_FILE}" \ + --outfile "${OUTFILE_MASSEFFECT}" + + # Changing features + echo "Changing features" + python bin/replace-features.py ${VECTOR_FILE_MASSEFFECT} ${OUTFILE_MASSEFFECT} + + # Extracting list file + cut -d' ' -f1 ${OUTFILE_MASSEFFECT} > "${OUTDIR}/masseffect_clustered_${k}_${kmean}.lst" +} + + +# -- MAIN LOOPS +for k in $(seq ${MIN_KFOLD} ${MAX_KFOLD}) +do + for kmean in $(seq ${MIN_KMEAN} ${MAX_KMEAN}) + do + transform + done +done