From 95142dfdc54218f17529b6757ed7f310811b9534 Mon Sep 17 00:00:00 2001
From: Mathias Quillot <mathiasquillot@gmail.com>
Date: Tue, 29 Oct 2019 10:19:25 +0100
Subject: [PATCH] Add language-based cluster measures and plots, pv_from_xv config, configurable k-fold range

---
 bin/replace_label_lst.py    |  5 +++
 config/pv_from_xv_config.sh | 13 ++++++++
 config/pvector_config.sh    |  1 +
 extract-labels.sh           | 10 +++---
 run-clustering.sh           | 23 +++++++++++++-
 run-measures.sh             | 76 ++++++++++++++++++++++++++++++++++-----------
 run.sh                      | 36 ++++++++++++++++++---
 7 files changed, 135 insertions(+), 29 deletions(-)
 create mode 100644 bin/replace_label_lst.py
 create mode 100644 config/pv_from_xv_config.sh

diff --git a/bin/replace_label_lst.py b/bin/replace_label_lst.py
new file mode 100644
index 0000000..47db163
--- /dev/null
+++ b/bin/replace_label_lst.py
@@ -0,0 +1,5 @@
+# Stub script: it only builds the argument parser; no arguments are declared or parsed yet.
+import argparse
+
+parser = argparse.ArgumentParser(description="extract label from lst file, move a label in fact")
+
diff --git a/config/pv_from_xv_config.sh b/config/pv_from_xv_config.sh
new file mode 100644
index 0000000..fda429e
--- /dev/null
+++ b/config/pv_from_xv_config.sh
@@ -0,0 +1,13 @@
+
+# Framework configuration: output, data and generated-list directories
+OUTDIR="exp/kmeans_euclidian/pv_from_xv"
+DATADIR="data"
+NEW_LSTDIR="${OUTDIR}/lst"
+# Vector files: common prefix and suffix of the file names (presumably joined around the fold number -- TODO confirm in run.sh)
+VECTOR_FILES_BEGIN="${DATADIR}/pv_from_xv/me_pv_teacher"
+VECTOR_FILES_END=".txt"
+VECTOR_FILE="" # Path of the vector file when there is only one; left empty here
+VECTOR_FILES_ONE=false # Set to true when there is only one vector file
+# Lower and upper bounds KMIN/KMAX (presumably the range of cluster counts to sweep -- TODO confirm in the scripts reading them)
+KMIN=2
+KMAX=100
diff --git a/config/pvector_config.sh b/config/pvector_config.sh
index 03617b6..e75dae1 100644
--- a/config/pvector_config.sh
+++ b/config/pvector_config.sh
@@ -1,3 +1,4 @@
+
 OUTDIR="exp/kmeans_euclidian/teacher-pvector-1"
 DATADIR="data"
 NEW_LSTDIR="${OUTDIR}/lst"
diff --git a/extract-labels.sh b/extract-labels.sh
index 34ea1f4..fa51890 100755
--- a/extract-labels.sh
+++ b/extract-labels.sh
@@ -2,20 +2,20 @@
 
 # Number of set
 k=4
+# Number of clusters
+kmean=88
 
 # Vector features file
-VECTOR_FILE_MASSEFFECT="data/pvectors_1rst/pvectors_teacher_${k}.txt"
+VECTOR_FILE_MASSEFFECT="data/xvectors.txt"
 
-# Number of clusters
-kmean=6
 
 # Dirs
-EXP_DIR="exp/kmeans_euclidian/teacher-pvector-1/${k}/${kmean}"
+EXP_DIR="exp/kmeans_euclidian/xvectors/${k}/${kmean}"
 CLUSTERING="${EXP_DIR}/clustering_${kmean}.pkl"
 
 
 # Output dirs
-OUTFILE_MASSEFFECT="data/pvectors_1rst/saved_clustered/masseffect_clustered_${k}_${kmean}.txt"
+OUTFILE_MASSEFFECT="data/xvectors/saved_clustered/masseffect_clustered_xvectors_${k}_${kmean}.txt"
 
 python3 bin/extract_kmeans.py "${CLUSTERING}" \
         "${VECTOR_FILE_MASSEFFECT}" \
diff --git a/run-clustering.sh b/run-clustering.sh
index 7af7fbd..a149c02 100755
--- a/run-clustering.sh
+++ b/run-clustering.sh
@@ -84,6 +84,27 @@ do
         "${METAS_TYPE}" \
         "${VAL_LST}" \
         --outfile "${SUB_EXP_DIR}/val_count_matrix_type.pdf"
-
+   
+ 
+    # -- MEASURES AND PLOTS WITH RESPECT TO THE LANG VARIABLE
+    # Measures: computed from the clustered labels, the lang metadata and both lists; written to measures_lang.json
+    python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
+        "${METAS_LANG}" \
+        "${TRAIN_LST}" \
+        "${VAL_LST}" \
+        --outfile "${SUB_EXP_DIR}/measures_lang.json"
+        
+    # This script plots the count matrix of the train set against the lang metadata
+    python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
+        "${METAS_LANG}" \
+        "${TRAIN_LST}" \
+        --outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf"
+        
+    # This script plots the count matrix of the validation set against the lang metadata
+    python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
+        "${METAS_LANG}" \
+        "${VAL_LST}" \
+        --outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf"
+    
 done
 
diff --git a/run-measures.sh b/run-measures.sh
index b2dc722..a328ced 100755
--- a/run-measures.sh
+++ b/run-measures.sh
@@ -2,7 +2,8 @@
 # quelques petites commandes que l'on souhaite
 # tester.
 
-OUTDIR="exp/kmeans_teacher_1/pvector-1"
+OUTDIR="exp/kmeans_euclidian/teacher-pvector-1"
+EXP_DIR=${OUTDIR}
 DATADIR="data"
 NEW_LSTDIR="${OUTDIR}/lst"
 
@@ -22,6 +23,7 @@ fi
 for kfold in {1..4}
 do
     pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt"
+    VECTOR_FILE=$pvector_file
     lst_dir="${DATADIR}/pvectors_1rst/lst"
     output_kfold="${OUTDIR}/${kfold}"
     
@@ -38,41 +40,79 @@ do
     #    --outfile "${NEW_LSTDIR}/val_${kfold}_type.lst"
    
     #cat "${NEW_LSTDIR}/train_${kfold}_type.lst" "${NEW_LSTDIR}/val_${kfold}_type.lst" > "${NEW_LSTDIR}/metas_${kfold}_type.lst"
+    TRAIN_LST="${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst"
+    VAL_LST="${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst"
+    TRAIN_LANG_LST="${NEW_LSTDIR}/train_${kfold}_lang.lst"
+    VAL_LANG_LST="${NEW_LSTDIR}/val_${kfold}_lang.lst"
+    METAS_LANG="${NEW_LSTDIR}/metas_${kfold}_lang.lst"
+    # Language lists: same rows as the train/val lists, with comma-separated field 2 overwritten by field 1.
+    # EXTRACT LANGUAGE INFORMATION (explicit awk action block: rows whose first field is empty or 0 are kept too)
+    awk -F, -v OFS=, '{ $2 = $1; print }' "${TRAIN_LST}" > "${TRAIN_LANG_LST}"
+    echo "TRAIN EXTRACT LANGUAGE INFO DONE"
+    awk -F, -v OFS=, '{ $2 = $1; print }' "${VAL_LST}" > "${VAL_LANG_LST}"
+    echo "VAL EXTRACT LANGUAGE INFO DONE"
+    cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}"
+    echo "GLOBAL EXTRACT LANGUAGE INFO DONE"
+  
 
- 
     echo "Clustering - ${kfold}"
 
     for k in $(seq ${kmin} 1 ${kmax})
     do
         echo "Kmeans Measuring and ploting - ${k}"
-	
-	    # This script compute measures from clustering   
-        #python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${pvector_file}" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures.json"
-        
+
+        SUB_EXP_DIR="${EXP_DIR}/${kfold}/${k}"
+
+        # -- EXTRACT CLUSTERING LABELS from the saved clustering into clustered_${k}.txt
+        python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \
+            "${VECTOR_FILE}" \
+            --outfile "${SUB_EXP_DIR}/clustered_${k}.txt"
+
+        # -- MEASURES AND PLOTS WITH RESPECT TO THE LANG VARIABLE
+        # Measures: written to measures_lang.json
+        python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
+            "${METAS_LANG}" \
+            "${TRAIN_LST}" \
+            "${VAL_LST}" \
+            --outfile "${SUB_EXP_DIR}/measures_lang.json"
+
+        # This script plots the count matrix of the train set
+        python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
+            "${METAS_LANG}" \
+            "${TRAIN_LST}" \
+            --outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf"
+
+        # This script plots the count matrix of the validation set
+        python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
+            "${METAS_LANG}" \
+            "${VAL_LST}" \
+            --outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf"
+        # Remove the intermediate label file; quoted and guarded with -- so the path is passed as one operand
+        rm -- "${SUB_EXP_DIR}/clustered_${k}.txt"
         #python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" \
         #    "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" \
         #    "${lst_dir}/val_${kfold}.lst" \
         #    --outfile "${output_kfold}/${k}/measures_type.json"
         
         # This script plot the count matrix of the train set
-        python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
-            ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \
-            --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf
+        #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
+        #    ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \
+        #    --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf
         
         # This script plot the count matrix of the validation set
-        python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
-            ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \
-            --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf
+        #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
+        #    ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \
+        #    --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf
         
         # This script plot the count matrix of the train set
-        python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
-            ${pvector_file} ${lst_dir}/train_${kfold}.lst \
-            --outfile ${output_kfold}/${k}/train_count_matrix.pdf
+        #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
+        #    ${pvector_file} ${lst_dir}/train_${kfold}.lst \
+        #    --outfile ${output_kfold}/${k}/train_count_matrix.pdf
         
         # This script plot the count matrix of the validation set
-        python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
-            ${pvector_file} ${lst_dir}/val_${kfold}.lst \
-            --outfile ${output_kfold}/${k}/val_count_matrix.pdf
+        #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
+        #    ${pvector_file} ${lst_dir}/val_${kfold}.lst \
+        #    --outfile ${output_kfold}/${k}/val_count_matrix.pdf
     done
 done
 
diff --git a/run.sh b/run.sh
index 4b9b39a..310f58b 100755
--- a/run.sh
+++ b/run.sh
@@ -44,13 +44,26 @@ then
 fi
 
 
+# -- KFOLD MIN and MAX: fall back to folds 1..4 when MIN_KFOLD / MAX_KFOLD are unset or empty
+if [ -z "$MIN_KFOLD" ]
+then
+    MIN_KFOLD=1
+fi
+
+if [ -z "$MAX_KFOLD" ]
+then
+    MAX_KFOLD=4
+fi
+
 # -- BEGIN BY KFOLD
-for kfold in {1..4}
+for kfold in $(seq ${MIN_KFOLD} ${MAX_KFOLD})
 do
     # Some usefull variable
     CHAR_INFO="${DATADIR}/character_information.csv"
     TRAIN_TYPE_LST="${NEW_LSTDIR}/train_${kfold}_type.lst"
     VAL_TYPE_LST="${NEW_LSTDIR}/val_${kfold}_type.lst"
+    TRAIN_LANG_LST="${NEW_LSTDIR}/train_${kfold}_lang.lst"
+    VAL_LANG_LST="${NEW_LSTDIR}/val_${kfold}_lang.lst"
     
     # Configuration for the run clustering file
     if [ ${VECTOR_FILES_ONE} == false ]
@@ -61,9 +74,9 @@ do
     TRAIN_LST="${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst"
     VAL_LST="${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst"
     EXP_DIR="${OUTDIR}/${kfold}"
-    METAS_TYPE="${NEW_LSTDIR}/metas_${kfold}_type.lst" #*
+    METAS_TYPE="${NEW_LSTDIR}/metas_${kfold}_type.lst"
     METAS_CHARACTER="${DATADIR}/masseffect.lst"
-
+    METAS_LANG="${NEW_LSTDIR}/metas_${kfold}_lang.lst"
     
     
     if [ ! -d "${EXP_DIR}" ];
@@ -72,8 +85,9 @@ do
     fi
 
 
-    # Extract character information
+    # EXTRACT TYPE INFORMATION
     echo "Extracting character information"
+    echo "Replace in train"
     python3 "bin/replace_label.py" \
         "${METAS_CHARACTER}" \
         "${CHAR_INFO}" \
@@ -81,6 +95,7 @@ do
         --lst "${TRAIN_LST}" \
         --outfile "${TRAIN_TYPE_LST}"
 
+    echo "Replace in val"
     python3 "bin/replace_label.py" \
         "${METAS_CHARACTER}" \
         "${CHAR_INFO}" \
@@ -88,8 +103,19 @@ do
         --lst "${VAL_LST}" \
         --outfile "${VAL_TYPE_LST}"
 
+    echo "Merge them"
     cat "${TRAIN_TYPE_LST}" "${VAL_TYPE_LST}" > "${METAS_TYPE}"
-
+    
+    # EXTRACT LANGUAGE INFORMATION: copy each list with comma-separated field 2 overwritten by field 1
+    echo "Language info for train"
+    awk -F, -v OFS=, '{ $2 = $1; print }' "${TRAIN_LST}" > "${TRAIN_LANG_LST}"
+    echo "Language info for val"
+    awk -F, -v OFS=, '{ $2 = $1; print }' "${VAL_LST}" > "${VAL_LANG_LST}"
+    # The explicit awk action blocks above keep rows whose first field is empty or 0.
+    echo "Merge them"
+    cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}"
+    
+    echo "Then Run Clustering"
     source "run-clustering.sh"
 done
 
-- 
1.8.2.3