Commit 95142dfdc54218f17529b6757ed7f310811b9534

Authored by Mathias Quillot
1 parent 0ab563604a
Exists in master

maj. No comment

Showing 7 changed files with 135 additions and 29 deletions Side-by-side Diff

bin/replace_label_lst.py
  1 +
  2 +import argparse
  3 +
  4 +parser = argparse.ArgumentParser(description="extract label from lst file, move a label in fact")
config/pv_from_xv_config.sh
  1 +
  2 +# Framework configuration
  3 +OUTDIR="exp/kmeans_euclidian/pv_from_xv"
  4 +DATADIR="data"
  5 +NEW_LSTDIR="${OUTDIR}/lst"
  6 +
  7 +VECTOR_FILES_BEGIN="${DATADIR}/pv_from_xv/me_pv_teacher"
  8 +VECTOR_FILES_END=".txt"
  9 +VECTOR_FILE="" # To specify if there's only one
  10 +VECTOR_FILES_ONE=false # Specify there's only one file
  11 +
  12 +KMIN=2
  13 +KMAX=100
config/pvector_config.sh
  1 +
1 2 OUTDIR="exp/kmeans_euclidian/teacher-pvector-1"
2 3 DATADIR="data"
3 4 NEW_LSTDIR="${OUTDIR}/lst"
... ... @@ -2,20 +2,20 @@
2 2  
3 3 # Number of set
4 4 k=4
  5 +kmean=88
5 6  
  7 +
6 8 # Vector features file
7   -VECTOR_FILE_MASSEFFECT="data/pvectors_1rst/pvectors_teacher_${k}.txt"
  9 +VECTOR_FILE_MASSEFFECT="data/xvectors.txt"
8 10  
9   -# Number of clusters
10   -kmean=6
11 11  
12 12 # Dirs
13   -EXP_DIR="exp/kmeans_euclidian/teacher-pvector-1/${k}/${kmean}"
  13 +EXP_DIR="exp/kmeans_euclidian/xvectors/${k}/${kmean}"
14 14 CLUSTERING="${EXP_DIR}/clustering_${kmean}.pkl"
15 15  
16 16  
17 17 # Output dirs
18   -OUTFILE_MASSEFFECT="data/pvectors_1rst/saved_clustered/masseffect_clustered_${k}_${kmean}.txt"
  18 +OUTFILE_MASSEFFECT="data/xvectors/saved_clustered/masseffect_clustered_xvectors_${k}_${kmean}.txt"
19 19  
20 20 python3 bin/extract_kmeans.py "${CLUSTERING}" \
21 21 "${VECTOR_FILE_MASSEFFECT}" \
... ... @@ -84,6 +84,27 @@
84 84 "${METAS_TYPE}" \
85 85 "${VAL_LST}" \
86 86 --outfile "${SUB_EXP_DIR}/val_count_matrix_type.pdf"
87   -
  87 +
  88 +
  89 + # -- MEASURES AND PLOT WITH RESPECT TO LANG VAR
  90 + # Measures
  91 + python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
  92 + "${METAS_LANG}" \
  93 + "${TRAIN_LST}" \
  94 + "${VAL_LST}" \
  95 + --outfile "${SUB_EXP_DIR}/measures_lang.json"
  96 +
  97 + # This script plots the count matrix of the train set
  98 + python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
  99 + "${METAS_LANG}" \
  100 + "${TRAIN_LST}" \
  101 + --outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf"
  102 +
  103 + # This script plots the count matrix of the validation set
  104 + python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
  105 + "${METAS_LANG}" \
  106 + "${VAL_LST}" \
  107 + --outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf"
  108 +
88 109 done
... ... @@ -2,7 +2,8 @@
2 2 # quelques petites commandes que l'on souhaite
3 3 # tester.
4 4  
5   -OUTDIR="exp/kmeans_teacher_1/pvector-1"
  5 +OUTDIR="exp/kmeans_euclidian/teacher-pvector-1"
  6 +EXP_DIR=${OUTDIR}
6 7 DATADIR="data"
7 8 NEW_LSTDIR="${OUTDIR}/lst"
8 9  
... ... @@ -22,6 +23,7 @@
22 23 for kfold in {1..4}
23 24 do
24 25 pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt"
  26 + VECTOR_FILE=$pvector_file
25 27 lst_dir="${DATADIR}/pvectors_1rst/lst"
26 28 output_kfold="${OUTDIR}/${kfold}"
27 29  
28 30  
29 31  
30 32  
31 33  
32 34  
33 35  
... ... @@ -38,41 +40,79 @@
38 40 # --outfile "${NEW_LSTDIR}/val_${kfold}_type.lst"
39 41  
40 42 #cat "${NEW_LSTDIR}/train_${kfold}_type.lst" "${NEW_LSTDIR}/val_${kfold}_type.lst" > "${NEW_LSTDIR}/metas_${kfold}_type.lst"
  43 + TRAIN_LST=${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst
  44 + VAL_LST=${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst
  45 + TRAIN_LANG_LST=${NEW_LSTDIR}/train_${kfold}_lang.lst
  46 + VAL_LANG_LST=${NEW_LSTDIR}/val_${kfold}_lang.lst
  47 + METAS_LANG=${NEW_LSTDIR}/metas_${kfold}_lang.lst
  48 +
  49 + # EXTRACT LANGUAGE INFORMATION
  50 + awk '$2=$1' FS=, OFS=, ${TRAIN_LST} > ${TRAIN_LANG_LST}
  51 + echo "VAL EXTRACT LANGUAGE INFO DONE"
  52 + awk '$2=$1' FS=, OFS=, ${VAL_LST} > ${VAL_LANG_LST}
  53 + echo "TRAIN EXTRACT LANGUAGE INFO DONE"
  54 + cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}"
  55 + echo "GLOBAL EXTRACT LANGUAGE INFO DONE"
  56 +
41 57  
42   -
43 58 echo "Clustering - ${kfold}"
44 59  
45 60 for k in $(seq ${kmin} 1 ${kmax})
46 61 do
47 62 echo "Kmeans Measuring and ploting - ${k}"
48   -
49   - # This script compute measures from clustering
50   - #python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${pvector_file}" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures.json"
51   -
  63 +
  64 + SUB_EXP_DIR="${EXP_DIR}/${kfold}/${k}"
  65 +
  66 + # -- EXTRACT CLUSTERING LABELS
  67 + python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \
  68 + "${VECTOR_FILE}" \
  69 + --outfile "${SUB_EXP_DIR}/clustered_${k}.txt"
  70 +
  71 + # -- MEASURES AND PLOT WITH RESPECT TO LANG VAR
  72 + # Measures
  73 + python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
  74 + "${METAS_LANG}" \
  75 + "${TRAIN_LST}" \
  76 + "${VAL_LST}" \
  77 + --outfile "${SUB_EXP_DIR}/measures_lang.json"
  78 +
  79 + # This script plots the count matrix of the train set
  80 + python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
  81 + "${METAS_LANG}" \
  82 + "${TRAIN_LST}" \
  83 + --outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf"
  84 +
  85 + # This script plots the count matrix of the validation set
  86 + python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
  87 + "${METAS_LANG}" \
  88 + "${VAL_LST}" \
  89 + --outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf"
  90 +
  91 + rm ${SUB_EXP_DIR}/clustered_${k}.txt
52 92 #python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" \
53 93 # "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" \
54 94 # "${lst_dir}/val_${kfold}.lst" \
55 95 # --outfile "${output_kfold}/${k}/measures_type.json"
56 96  
57 97 # This script plots the count matrix of the train set
58   - python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
59   - ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \
60   - --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf
  98 + #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
  99 + # ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \
  100 + # --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf
61 101  
62 102 # This script plots the count matrix of the validation set
63   - python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
64   - ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \
65   - --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf
  103 + #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
  104 + # ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \
  105 + # --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf
66 106  
67 107 # This script plots the count matrix of the train set
68   - python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
69   - ${pvector_file} ${lst_dir}/train_${kfold}.lst \
70   - --outfile ${output_kfold}/${k}/train_count_matrix.pdf
  108 + #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
  109 + # ${pvector_file} ${lst_dir}/train_${kfold}.lst \
  110 + # --outfile ${output_kfold}/${k}/train_count_matrix.pdf
71 111  
72 112 # This script plots the count matrix of the validation set
73   - python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
74   - ${pvector_file} ${lst_dir}/val_${kfold}.lst \
75   - --outfile ${output_kfold}/${k}/val_count_matrix.pdf
  113 + #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
  114 + # ${pvector_file} ${lst_dir}/val_${kfold}.lst \
  115 + # --outfile ${output_kfold}/${k}/val_count_matrix.pdf
76 116 done
77 117 done
... ... @@ -44,13 +44,26 @@
44 44 fi
45 45  
46 46  
  47 +# -- KFOLD MIN and MAX
  48 +if [ -z "$MIN_KFOLD" ]
  49 +then
  50 + MIN_KFOLD=1
  51 +fi
  52 +
  53 +if [ -z "$MAX_KFOLD" ]
  54 +then
  55 + MAX_KFOLD=4
  56 +fi
  57 +
47 58 # -- BEGIN BY KFOLD
48   -for kfold in {1..4}
  59 +for kfold in $(seq ${MIN_KFOLD} ${MAX_KFOLD})
49 60 do
50 61 # Some useful variables
51 62 CHAR_INFO="${DATADIR}/character_information.csv"
52 63 TRAIN_TYPE_LST="${NEW_LSTDIR}/train_${kfold}_type.lst"
53 64 VAL_TYPE_LST="${NEW_LSTDIR}/val_${kfold}_type.lst"
  65 + TRAIN_LANG_LST="${NEW_LSTDIR}/train_${kfold}_lang.lst"
  66 + VAL_LANG_LST="${NEW_LSTDIR}/val_${kfold}_lang.lst"
54 67  
55 68 # Configuration for the run clustering file
56 69 if [ ${VECTOR_FILES_ONE} == false ]
57 70  
... ... @@ -61,9 +74,9 @@
61 74 TRAIN_LST="${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst"
62 75 VAL_LST="${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst"
63 76 EXP_DIR="${OUTDIR}/${kfold}"
64   - METAS_TYPE="${NEW_LSTDIR}/metas_${kfold}_type.lst" #*
  77 + METAS_TYPE="${NEW_LSTDIR}/metas_${kfold}_type.lst"
65 78 METAS_CHARACTER="${DATADIR}/masseffect.lst"
66   -
  79 + METAS_LANG="${NEW_LSTDIR}/metas_${kfold}_lang.lst"
67 80  
68 81  
69 82 if [ ! -d "${EXP_DIR}" ];
70 83  
... ... @@ -72,8 +85,9 @@
72 85 fi
73 86  
74 87  
75   - # Extract character information
  88 + # EXTRACT TYPE INFORMATION
76 89 echo "Extracting character information"
  90 + echo "Replace in train"
77 91 python3 "bin/replace_label.py" \
78 92 "${METAS_CHARACTER}" \
79 93 "${CHAR_INFO}" \
... ... @@ -81,6 +95,7 @@
81 95 --lst "${TRAIN_LST}" \
82 96 --outfile "${TRAIN_TYPE_LST}"
83 97  
  98 + echo "Replace in val"
84 99 python3 "bin/replace_label.py" \
85 100 "${METAS_CHARACTER}" \
86 101 "${CHAR_INFO}" \
87 102  
... ... @@ -88,8 +103,19 @@
88 103 --lst "${VAL_LST}" \
89 104 --outfile "${VAL_TYPE_LST}"
90 105  
  106 + echo "Merge them"
91 107 cat "${TRAIN_TYPE_LST}" "${VAL_TYPE_LST}" > "${METAS_TYPE}"
92   -
  108 +
  109 + # EXTRACT LANGUAGE INFORMATION
  110 + echo "Language info for train"
  111 + awk '$2=$1' FS=, OFS=, ${TRAIN_LST} > ${TRAIN_LANG_LST}
  112 + echo "Language info for val"
  113 + awk '$2=$1' FS=, OFS=, ${VAL_LST} > ${VAL_LANG_LST}
  114 +
  115 + echo "Merge them"
  116 + cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}"
  117 +
  118 + echo "Then Run Clustering"
93 119 source "run-clustering.sh"
94 120 done
95 121