Commit fee5922c3583c647d955c047809e5610ec8d7d63

Authored by Mathias Quillot
1 parent 151e596e35
Exists in master

New way to exec the run file. Now you can run the clustering just for one model…

…, or use the run file to launch it for each fold. You can configure it with the configuration files in config.

Showing 5 changed files with 198 additions and 76 deletions Side-by-side Diff

config/ivector_config.sh
  1 +OUTDIR="exp/kmeans_euclidian/ivectors"
  2 +DATADIR="data"
  3 +NEW_LSTDIR="${OUTDIR}/lst"
  4 +
  5 +VECTOR_FILE="data/ivectors.txt" # Used only when there is a single vector file
  6 +VECTOR_FILES_ONE=true # true: a single vector file (VECTOR_FILE) is used for every fold
  7 +
  8 +KMIN=2
  9 +KMAX=100
config/pvector_config.sh
  1 +OUTDIR="exp/kmeans_euclidian/teacher-pvector-1"
  2 +DATADIR="data"
  3 +NEW_LSTDIR="${OUTDIR}/lst"
  4 +
  5 +VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher"
  6 +VECTOR_FILES_END=".txt"
  7 +VECTOR_FILE="" # Used only when there is a single vector file
  8 +VECTOR_FILES_ONE=false # false: one vector file per fold, named from VECTOR_FILES_BEGIN/VECTOR_FILES_END
  9 +
  10 +KMIN=2
  11 +KMAX=100
config/xvector_config.sh
  1 +OUTDIR="exp/kmeans_euclidian/xvectors"
  2 +DATADIR="data"
  3 +NEW_LSTDIR="${OUTDIR}/lst"
  4 +
  5 +VECTOR_FILE="data/xvectors.txt" # Used only when there is a single vector file
  6 +VECTOR_FILES_ONE=true # true: a single vector file (VECTOR_FILE) is used for every fold
  7 +
  8 +KMIN=2
  9 +KMAX=100
  1 +#
  2 +# This script aims to compute clustering
  3 +#
  4 +
  5 +
  6 +# -- CONFIGURATION
  7 +# THIS SCRIPT NEEDS THESE VARIABLES
  8 +# Vector file
  9 +#VECTOR_FILE=""
  10 +# Train list
  11 +#TRAIN_LST=""
  12 +# Val list
  13 +#VAL_LST=""
  14 +# Exp directory
  15 +#EXP_DIR=""
  16 +# Metas file with type values
  17 +#METAS_TYPE=""
  18 +# Metas file with character values
  19 +#METAS_CHARACTER=""
  20 +
  21 +
  22 +#echo "VECTOR FILE: $VECTOR_FILE"
  23 +#echo "TRAIN LIST: $TRAIN_LST"
  24 +#echo "VAL LIST: $VAL_LST"
  25 +#echo "EXP DIR: $EXP_DIR"
  26 +#echo "METAS TYPE: $METAS_TYPE"
  27 +#echo "METAS_CHARACTER: $METAS_CHARACTER"
  28 +
  29 +
  30 +
  31 +# -- TRAIN KMEANS
  32 +echo "Clustering - ${kfold}"
  33 +python3 bin/cluster_kmeans.py "${VECTOR_FILE}" \
  34 + "${TRAIN_LST}" \
  35 + "${EXP_DIR}" --kmin ${KMIN} --kmax ${KMAX}
  36 +
  37 +
  38 +
  39 +for k in $(seq ${KMIN} 1 ${KMAX})
  40 +do
  41 + SUB_EXP_DIR="${EXP_DIR}/${k}"
  42 +
  43 + # -- EXTRACT KMEANS VALUES
  44 + echo "Kmeans Measuring and extraction - ${k}"
  45 + python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \
  46 + "${VECTOR_FILE}" \
  47 + --outfile "${SUB_EXP_DIR}/clustered_${k}.txt"
  48 + # -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR
  49 + # Measures
  50 + python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
  51 + "${METAS_CHARACTER}" \
  52 + "${TRAIN_LST}" \
  53 + "${VAL_LST}" \
  54 + --outfile "${SUB_EXP_DIR}/measures.json"
  55 +
  56 + # Plot count matrix for train
  57 + python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
  58 + ${VECTOR_FILE} \
  59 + ${TRAIN_LST} \
  60 + --outfile "${SUB_EXP_DIR}/train_count_matrix.pdf"
  61 +
  62 + # Plot count matrix for val
  63 + python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
  64 + ${VECTOR_FILE} \
  65 + ${VAL_LST} \
  66 + --outfile "${SUB_EXP_DIR}/val_count_matrix.pdf"
  67 +
  68 + # -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR
  69 + # Measures
  70 + python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
  71 + "${METAS_TYPE}" \
  72 + "${TRAIN_LST}" \
  73 + "${VAL_LST}" \
  74 + --outfile "${SUB_EXP_DIR}/measures_type.json"
  75 +
  76 + # Plot the count matrix of the train set
  77 + python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
  78 + "${METAS_TYPE}" \
  79 + "${TRAIN_LST}" \
  80 + --outfile "${SUB_EXP_DIR}/train_count_matrix_type.pdf"
  81 +
  82 + # Plot the count matrix of the validation set
  83 + python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
  84 + "${METAS_TYPE}" \
  85 + "${VAL_LST}" \
  86 + --outfile "${SUB_EXP_DIR}/val_count_matrix_type.pdf"
  87 +
  88 +done
1   -# Pour le moment, le run ne fait qu'executer
2   -# quelques petites commandes que l'on souhaite
3   -# tester.
4 1  
5   -OUTDIR="exp/kmeans_teacher_1/pvector-1"
6   -DATADIR="data"
7   -NEW_LSTDIR="${OUTDIR}/lst"
  2 +#OUTDIR="exp/test/pvector-2"
  3 +#DATADIR="data"
  4 +#NEW_LSTDIR="${OUTDIR}/lst"
8 5  
9   -kmin=2
10   -kmax=100
  6 +#VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher"
  7 +#VECTOR_FILES_END=".txt"
  8 +#VECTOR_FILE="" # To specify if there's only one
  9 +#VECTOR_FILES_ONE=false # Specify there's only one file
11 10  
  11 +#KMIN=2
  12 +#KMAX=100
  13 +
  14 +# -- LOAD CONFIG FILE
  15 +CONFIG_FILE="config.sh"
  16 +
  17 +if [ $# -eq 1 ]
  18 +then
  19 + CONFIG_FILE="$1"
  20 +else
  21 + echo "Need to have one and only one argument"
  22 + exit -1
  23 +fi
  24 +
  25 +source $CONFIG_FILE
  26 +
  27 +# -- DEFAULTS VALUES CONFIGURATION
  28 +if [ -z "$VECTOR_FILES_ONE" ]
  29 +then
  30 + VECTOR_FILES_ONE=false
  31 +fi
  32 +
  33 +
  34 +
  35 +# -- MAKE DIRECTORIES
12 36 if [ ! -d "$OUTDIR" ];
13 37 then
14 38 mkdir -p $OUTDIR
15 39  
16 40  
17 41  
18 42  
19 43  
20 44  
21 45  
22 46  
23 47  
24 48  
25 49  
... ... @@ -19,82 +43,61 @@
19 43 mkdir -p ${NEW_LSTDIR}
20 44 fi
21 45  
22   -for kfold in 4 #..4}
  46 +
  47 +# -- BEGIN BY KFOLD
  48 +for kfold in {1..4}
23 49 do
24   - #echo "kfold = ${kfold}"
25   - pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt"
26   - lst_dir="${DATADIR}/pvectors_1rst/lst"
27   - output_kfold="${OUTDIR}/${kfold}"
  50 + # Some useful variables
  51 + CHAR_INFO="${DATADIR}/character_information.csv"
  52 + TRAIN_TYPE_LST="${NEW_LSTDIR}/train_${kfold}_type.lst"
  53 + VAL_TYPE_LST="${NEW_LSTDIR}/val_${kfold}_type.lst"
28 54  
29   - if [ ! -d "${output_kfold}" ];
  55 + # Configuration for the run clustering file
  56 + if [ ! ${VECTOR_FILES_ONE} ]
30 57 then
31   - mkdir -p ${output_kfold}
  58 + VECTOR_FILE="${VECTOR_FILES_BEGIN}_${kfold}${VECTOR_FILES_END}"
32 59 fi
33   -
34 60  
  61 + TRAIN_LST="${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst"
  62 + VAL_LST="${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst"
  63 + EXP_DIR="${OUTDIR}/${kfold}"
  64 + METAS_TYPE="${NEW_LSTDIR}/metas_${kfold}_type.lst" #*
  65 + METAS_CHARACTER="${DATADIR}/masseffect.lst"
  66 +
  67 +
  68 +
  69 + if [ ! -d "${EXP_DIR}" ];
  70 + then
  71 + mkdir -p ${EXP_DIR}
  72 + fi
  73 +
  74 +
35 75 # Extract character information
36 76 echo "Extracting character information"
37 77 python3 "bin/replace_label.py" \
38   - "${DATADIR}/masseffect.lst" \
39   - "${DATADIR}/character_information.csv" \
40   - --field "type" --lst "data/pvectors_1rst/lst/train_${kfold}.lst" \
41   - --outfile "${NEW_LSTDIR}/train_${kfold}_type.lst"
42   -
  78 + "${METAS_CHARACTER}" \
  79 + "${CHAR_INFO}" \
  80 + --field "type" \
  81 + --lst "${TRAIN_LST}" \
  82 + --outfile "${TRAIN_TYPE_LST}"
  83 +
43 84 python3 "bin/replace_label.py" \
44   - "${DATADIR}/masseffect.lst" \
45   - "${DATADIR}/character_information.csv" \
46   - --field "type" --lst "data/pvectors_1rst/lst/val_${kfold}.lst" \
47   - --outfile "${NEW_LSTDIR}/val_${kfold}_type.lst"
48   - cat "${NEW_LSTDIR}/train_${kfold}_type.lst" "${NEW_LSTDIR}/val_${kfold}_type.lst" > "${NEW_LSTDIR}/metas_${kfold}_type.lst"
  85 + "${METAS_CHARACTER}" \
  86 + "${CHAR_INFO}" \
  87 + --field "type" \
  88 + --lst "${VAL_LST}" \
  89 + --outfile "${VAL_TYPE_LST}"
49 90  
50   - # -- TRAIN KMEANS
51   - echo "Clustering - ${kfold}"
52   - python3 bin/cluster_kmeans.py "${pvector_file}" \
53   - "${lst_dir}/train_${kfold}.lst" \
54   - "${output_kfold}" --kmin ${kmin} --kmax ${kmax}
  91 + cat "${TRAIN_TYPE_LST}" "${VAL_TYPE_LST}" > "${METAS_TYPE}"
55 92  
56   - for k in $(seq ${kmin} 1 ${kmax})
57   - do
58   - # -- EXTRACT KMEANS VALUES
59   - echo "Kmeans Measuring and extraction - ${k}"
60   - python3 bin/extract_kmeans.py "${output_kfold}/${k}/clustering_${k}.pkl" \
61   - "${pvector_file}" \
62   - --outfile "${output_kfold}/${k}/clustered_${k}.txt"
63   -
64   -
65   - # -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR
66   - # Measures
67   - python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${pvector_file}" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures.json"
68   -
69   - # Plot count matrix for train
70   - python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
71   - ${pvector_file} ${lst_dir}/train_${kfold}.lst \
72   - --outfile ${output_kfold}/${k}/train_count_matrix.pdf
73   -
74   - # Plot count matrix for val
75   - python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
76   - ${pvector_file} ${lst_dir}/val_${kfold}.lst \
77   - --outfile ${output_kfold}/${k}/val_count_matrix.pdf
  93 + source "run-clustering.sh"
  94 +done
78 95  
79   - # Regroup measures with respect to character var
80   - python3 bin/regroup-measures.py exp/kmeans_teacher_1/pvector-1/
  96 +# Regroup measures with respect to character classes
  97 +echo "Regrouping measures with respect to character classes"
  98 +python3 "bin/regroup-measures.py" ${OUTDIR}
81 99  
82   - # -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR
83   - # Measures
84   - python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures_type.json"
85   -
86   - # This script plot the count matrix of the train set
87   - python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
88   - ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \
89   - --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf
90   -
91   - # This script plot the count matrix of the validation set
92   - python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
93   - ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \
94   - --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf
95   -
96   - # Regroup measures with respect to type var
97   - python3 bin/regroup-measures.py exp/kmeans_teacher_1/pvector-1/ --suffix "_type" --measurefile "measures_type.j
98   - done
99   -done
  100 +# Regroup measures with respect to type classes
  101 +echo "Regrouping measures with respect to type classes"
  102 +python3 "bin/regroup-measures.py" ${OUTDIR} --suffix "_type" --measurefile "measures_type.json"