From fee5922c3583c647d955c047809e5610ec8d7d63 Mon Sep 17 00:00:00 2001
From: Mathias Quillot <mathiasquillot@gmail.com>
Date: Wed, 24 Jul 2019 23:54:56 +0200
Subject: [PATCH] New way to execute the run file. Now you can run the clustering
 for just one model, or use the run file to launch it for each fold. You can
 configure it with the configuration files in config/.

---
 config/ivector_config.sh |   9 +++
 config/pvector_config.sh |  11 ++++
 config/xvector_config.sh |   9 +++
 run-clustering.sh        |  89 +++++++++++++++++++++++++++
 run.sh                   | 156 ++++++++++++++++++++++++-----------------------
 5 files changed, 198 insertions(+), 76 deletions(-)
 create mode 100644 config/ivector_config.sh
 create mode 100644 config/pvector_config.sh
 create mode 100644 config/xvector_config.sh
 create mode 100755 run-clustering.sh

diff --git a/config/ivector_config.sh b/config/ivector_config.sh
new file mode 100644
index 0000000..883091a
--- /dev/null
+++ b/config/ivector_config.sh
@@ -0,0 +1,9 @@
+OUTDIR="exp/kmeans_euclidian/ivectors" # Root output directory; run.sh adds one sub-directory per fold
+DATADIR="data" # Directory holding the input lists and metadata read by run.sh
+NEW_LSTDIR="${OUTDIR}/lst" # Where run.sh writes the generated *_type.lst files
+
+VECTOR_FILE="data/ivectors.txt" # Vector file handed to the clustering scripts when there is only one
+VECTOR_FILES_ONE=true # true: a single vector file (VECTOR_FILE) is used
+
+KMIN=2 # Passed as --kmin to bin/cluster_kmeans.py; first k evaluated
+KMAX=100 # Passed as --kmax to bin/cluster_kmeans.py; last k evaluated
diff --git a/config/pvector_config.sh b/config/pvector_config.sh
new file mode 100644
index 0000000..03617b6
--- /dev/null
+++ b/config/pvector_config.sh
@@ -0,0 +1,11 @@
+OUTDIR="exp/kmeans_euclidian/teacher-pvector-1" # Root output directory; run.sh adds one sub-directory per fold
+DATADIR="data" # Directory holding the input lists and metadata read by run.sh
+NEW_LSTDIR="${OUTDIR}/lst" # Where run.sh writes the generated *_type.lst files
+
+VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher" # Per-fold file name prefix; run.sh appends _<fold>
+VECTOR_FILES_END=".txt" # Per-fold file name suffix, appended after the fold number
+VECTOR_FILE="" # Only to be set when a single vector file is used
+VECTOR_FILES_ONE=false # false: one vector file per fold; true: a single file (VECTOR_FILE)
+
+KMIN=2 # Passed as --kmin to bin/cluster_kmeans.py; first k evaluated
+KMAX=100 # Passed as --kmax to bin/cluster_kmeans.py; last k evaluated
diff --git a/config/xvector_config.sh b/config/xvector_config.sh
new file mode 100644
index 0000000..73a47fd
--- /dev/null
+++ b/config/xvector_config.sh
@@ -0,0 +1,9 @@
+OUTDIR="exp/kmeans_euclidian/xvectors" # Root output directory; run.sh adds one sub-directory per fold
+DATADIR="data" # Directory holding the input lists and metadata read by run.sh
+NEW_LSTDIR="${OUTDIR}/lst" # Where run.sh writes the generated *_type.lst files
+
+VECTOR_FILE="data/xvectors.txt" # Vector file handed to the clustering scripts when there is only one
+VECTOR_FILES_ONE=true # true: a single vector file (VECTOR_FILE) is used
+
+KMIN=2 # Passed as --kmin to bin/cluster_kmeans.py; first k evaluated
+KMAX=100 # Passed as --kmax to bin/cluster_kmeans.py; last k evaluated
diff --git a/run-clustering.sh b/run-clustering.sh
new file mode 100755
index 0000000..7af7fbd
--- /dev/null
+++ b/run-clustering.sh
@@ -0,0 +1,89 @@
+#
+# Computes the k-means clustering, then measures and plots it for every k.
+# 
+
+
+# -- CONFIGURATION
+# THIS SCRIPT NEEDS THESE VARIABLES (PLUS KMIN AND KMAX)
+# Vector file
+#VECTOR_FILE=""
+# Train list
+#TRAIN_LST=""
+# Val list
+#VAL_LST=""
+# Exp directory
+#EXP_DIR=""
+# Metas file with type values
+#METAS_TYPE=""
+# Metas file with character values
+#METAS_CHARACTER=""
+
+
+#echo "VECTOR FILE: $VECTOR_FILE"
+#echo "TRAIN LIST: $TRAIN_LST"
+#echo "VAL LIST: $VAL_LST"
+#echo "EXP DIR: $EXP_DIR"
+#echo "METAS TYPE: $METAS_TYPE"
+#echo "METAS_CHARACTER: $METAS_CHARACTER"
+
+
+
+# -- TRAIN KMEANS
+echo "Clustering - ${kfold}"
+python3 bin/cluster_kmeans.py "${VECTOR_FILE}" \
+    "${TRAIN_LST}" \
+    "${EXP_DIR}" --kmin "${KMIN}" --kmax "${KMAX}"
+
+# -- EVALUATE EVERY K FROM KMIN TO KMAX
+# Each pass works in ${EXP_DIR}/<k>: clustering_<k>.pkl in, results out.
+for k in $(seq "${KMIN}" 1 "${KMAX}")
+do
+    SUB_EXP_DIR="${EXP_DIR}/${k}"
+
+    # -- EXTRACT KMEANS VALUES
+    echo "Kmeans Measuring and extraction - ${k}"
+    python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \
+        "${VECTOR_FILE}" \
+        --outfile "${SUB_EXP_DIR}/clustered_${k}.txt"
+    # -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR
+    # Measures
+    python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
+        "${METAS_CHARACTER}" \
+        "${TRAIN_LST}" \
+        "${VAL_LST}" \
+        --outfile "${SUB_EXP_DIR}/measures.json"
+    # Plot count matrix for train. NOTE(review): the two plots below pass
+    # VECTOR_FILE where the measures above pass METAS_CHARACTER - confirm.
+    python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
+        "${VECTOR_FILE}" \
+        "${TRAIN_LST}" \
+        --outfile "${SUB_EXP_DIR}/train_count_matrix.pdf"
+
+    # Plot count matrix for val
+    python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
+        "${VECTOR_FILE}" \
+        "${VAL_LST}" \
+        --outfile "${SUB_EXP_DIR}/val_count_matrix.pdf"
+
+    # -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR
+    # Measures
+    python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
+        "${METAS_TYPE}" \
+        "${TRAIN_LST}" \
+        "${VAL_LST}" \
+        --outfile "${SUB_EXP_DIR}/measures_type.json"
+
+    # Plot the count matrix of the train set
+    python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
+        "${METAS_TYPE}" \
+        "${TRAIN_LST}" \
+        --outfile "${SUB_EXP_DIR}/train_count_matrix_type.pdf"
+
+    # Plot the count matrix of the validation set
+    python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
+        "${METAS_TYPE}" \
+        "${VAL_LST}" \
+        --outfile "${SUB_EXP_DIR}/val_count_matrix_type.pdf"
+
+done
+
diff --git a/run.sh b/run.sh
index 5def9ac..353a9f1 100755
--- a/run.sh
+++ b/run.sh
@@ -1,14 +1,38 @@
-# Pour le moment, le run ne fait qu'executer
-# quelques petites commandes que l'on souhaite
-# tester.
 
-OUTDIR="exp/kmeans_teacher_1/pvector-1"
-DATADIR="data"
-NEW_LSTDIR="${OUTDIR}/lst"
+#OUTDIR="exp/test/pvector-2"
+#DATADIR="data"
+#NEW_LSTDIR="${OUTDIR}/lst"
 
-kmin=2
-kmax=100
+#VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher"
+#VECTOR_FILES_END=".txt"
+#VECTOR_FILE="" # To specify if there's only one
+#VECTOR_FILES_ONE=false # Specify there's only one file
 
+#KMIN=2
+#KMAX=100
+
+# -- LOAD CONFIG FILE
+# Usage: run.sh CONFIG_FILE   (e.g. config/pvector_config.sh)
+CONFIG_FILE="config.sh"
+if [ $# -eq 1 ]
+then
+    CONFIG_FILE="$1"
+else
+    echo "Need to have one and only one argument" >&2
+    exit 255 # same status bash reports for the former, out-of-range "exit -1"
+fi
+
+# Quoted so a path with spaces stays one word; stop if the config cannot be loaded
+source "$CONFIG_FILE" || exit 1
+# -- DEFAULT VALUES CONFIGURATION
+if [ -z "$VECTOR_FILES_ONE" ]
+then
+    VECTOR_FILES_ONE=false
+fi
+
+
+
+# -- MAKE DIRECTORIES
 if [ ! -d "$OUTDIR" ];
 then
     mkdir -p $OUTDIR
@@ -19,82 +43,62 @@ then
     mkdir -p ${NEW_LSTDIR}
 fi
 
-for kfold in 4 #..4}
+
+# -- BEGIN BY KFOLD
+for kfold in {1..4}
 do
-    #echo "kfold = ${kfold}"
-    pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt"
-    lst_dir="${DATADIR}/pvectors_1rst/lst"
-    output_kfold="${OUTDIR}/${kfold}"
+    # Some useful variables
+    CHAR_INFO="${DATADIR}/character_information.csv"
+    TRAIN_TYPE_LST="${NEW_LSTDIR}/train_${kfold}_type.lst"
+    VAL_TYPE_LST="${NEW_LSTDIR}/val_${kfold}_type.lst"
     
-    if [ ! -d "${output_kfold}" ];
+    # Per-fold vector file unless a single one is configured ("[ ! false ]" is never true, so compare the value)
+    if [ "${VECTOR_FILES_ONE}" != true ]
     then
-        mkdir -p ${output_kfold}
+        VECTOR_FILE="${VECTOR_FILES_BEGIN}_${kfold}${VECTOR_FILES_END}"
     fi
-        
     
+    # Inputs expected by run-clustering.sh, which is sourced at the end of the loop body
+    TRAIN_LST="${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst"
+    VAL_LST="${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst"
+    EXP_DIR="${OUTDIR}/${kfold}"
+    METAS_TYPE="${NEW_LSTDIR}/metas_${kfold}_type.lst" # built below from the train and val type lists
+    METAS_CHARACTER="${DATADIR}/masseffect.lst"
+
+    # Quoted so an output path containing spaces stays a single directory
+    if [ ! -d "${EXP_DIR}" ];
+    then
+        mkdir -p "${EXP_DIR}"
+    fi
+
+
     # Extract character information
     echo "Extracting character information"
     python3 "bin/replace_label.py" \
-        "${DATADIR}/masseffect.lst" \
-        "${DATADIR}/character_information.csv" \
-        --field "type" --lst "data/pvectors_1rst/lst/train_${kfold}.lst" \
-        --outfile "${NEW_LSTDIR}/train_${kfold}_type.lst"
-    
+        "${METAS_CHARACTER}" \
+        "${CHAR_INFO}" \
+        --field "type" \
+        --lst "${TRAIN_LST}" \
+        --outfile "${TRAIN_TYPE_LST}"
+
     python3 "bin/replace_label.py" \
-        "${DATADIR}/masseffect.lst" \
-        "${DATADIR}/character_information.csv" \
-        --field "type" --lst "data/pvectors_1rst/lst/val_${kfold}.lst" \
-        --outfile "${NEW_LSTDIR}/val_${kfold}_type.lst"
-    cat "${NEW_LSTDIR}/train_${kfold}_type.lst" "${NEW_LSTDIR}/val_${kfold}_type.lst" > "${NEW_LSTDIR}/metas_${kfold}_type.lst"
-
-    # -- TRAIN KMEANS 
-    echo "Clustering - ${kfold}"
-    python3 bin/cluster_kmeans.py "${pvector_file}" \
-        "${lst_dir}/train_${kfold}.lst" \
-        "${output_kfold}" --kmin ${kmin} --kmax ${kmax}
-
-    for k in $(seq ${kmin} 1 ${kmax})
-    do
-        # -- EXTRACT KMEANS VALUES
-        echo "Kmeans Measuring and extraction - ${k}"
-        python3 bin/extract_kmeans.py "${output_kfold}/${k}/clustering_${k}.pkl" \
-            "${pvector_file}" \
-            --outfile "${output_kfold}/${k}/clustered_${k}.txt"
-        
-        
-        # -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR
-        # Measures
-        python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${pvector_file}" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures.json"
-        
-        # Plot count matrix for train
-        python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
-        ${pvector_file} ${lst_dir}/train_${kfold}.lst \
-        --outfile ${output_kfold}/${k}/train_count_matrix.pdf
-        
-        # Plot count matrix for val
-        python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
-            ${pvector_file} ${lst_dir}/val_${kfold}.lst \
-            --outfile ${output_kfold}/${k}/val_count_matrix.pdf
-
-        # Regroup measures with respect to character var
-        python3 bin/regroup-measures.py exp/kmeans_teacher_1/pvector-1/
-
-        # -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR
-        # Measures
-        python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures_type.json"
-        
-        # This script plot the count matrix of the train set
-        python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
-        ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \
-        --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf
-        
-        # This script plot the count matrix of the validation set
-        python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
-        ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \
-        --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf
-
-        # Regroup measures with respect to type var 
-        python3 bin/regroup-measures.py exp/kmeans_teacher_1/pvector-1/ --suffix "_type" --measurefile "measures_type.j
-    done
+        "${METAS_CHARACTER}" \
+        "${CHAR_INFO}" \
+        --field "type" \
+        --lst "${VAL_LST}" \
+        --outfile "${VAL_TYPE_LST}"
+    # Concatenate the train and val type lists into the metas file of this fold
+    cat "${TRAIN_TYPE_LST}" "${VAL_TYPE_LST}" > "${METAS_TYPE}"
+    # Sourced, not executed: run-clustering.sh uses the variables set in this loop
+    source "run-clustering.sh"
 done
 
+# Regroup measures with respect to character classes (once, after the fold loop)
+echo "Regrouping measures with respect to character classes"
+python3 "bin/regroup-measures.py" "${OUTDIR}"
+
+# Regroup measures with respect to type classes (reads the measures_type.json files)
+echo "Regrouping measures with respect to type classes"
+python3 "bin/regroup-measures.py" "${OUTDIR}" --suffix "_type" --measurefile "measures_type.json"
+
+
-- 
1.8.2.3