Commit 95142dfdc54218f17529b6757ed7f310811b9534
1 parent
0ab563604a
Exists in
master
maj. No comment
Showing 7 changed files with 135 additions and 29 deletions Inline Diff
bin/replace_label_lst.py
File was created | 1 | ||
2 | import argparse | ||
3 | |||
4 | parser = argparse.ArgumentParser(description="extract label from lst file, move a label in fact") | ||
5 | |||
6 |
config/pv_from_xv_config.sh
File was created | 1 | ||
2 | # Framework configuration | ||
3 | OUTDIR="exp/kmeans_euclidian/pv_from_xv" | ||
4 | DATADIR="data" | ||
5 | NEW_LSTDIR="${OUTDIR}/lst" | ||
6 | |||
7 | VECTOR_FILES_BEGIN="${DATADIR}/pv_from_xv/me_pv_teacher" | ||
8 | VECTOR_FILES_END=".txt" | ||
9 | VECTOR_FILE="" # To specify if there's only one | ||
10 | VECTOR_FILES_ONE=false # Specify there's only one file | ||
11 | |||
12 | KMIN=2 | ||
13 | KMAX=100 | ||
14 |
config/pvector_config.sh
1 | |||
1 | OUTDIR="exp/kmeans_euclidian/teacher-pvector-1" | 2 | OUTDIR="exp/kmeans_euclidian/teacher-pvector-1" |
2 | DATADIR="data" | 3 | DATADIR="data" |
3 | NEW_LSTDIR="${OUTDIR}/lst" | 4 | NEW_LSTDIR="${OUTDIR}/lst" |
4 | 5 | ||
5 | VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher" | 6 | VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher" |
6 | VECTOR_FILES_END=".txt" | 7 | VECTOR_FILES_END=".txt" |
7 | VECTOR_FILE="" # To specify if there's only one | 8 | VECTOR_FILE="" # To specify if there's only one |
8 | VECTOR_FILES_ONE=false # Specify there's only one file | 9 | VECTOR_FILES_ONE=false # Specify there's only one file |
9 | 10 | ||
10 | KMIN=2 | 11 | KMIN=2 |
11 | KMAX=100 | 12 | KMAX=100 |
12 | 13 |
extract-labels.sh
1 | 1 | ||
2 | 2 | ||
3 | # Number of set | 3 | # Number of set |
4 | k=4 | 4 | k=4 |
5 | kmean=88 | ||
5 | 6 | ||
7 | |||
6 | # Vector features file | 8 | # Vector features file |
7 | VECTOR_FILE_MASSEFFECT="data/pvectors_1rst/pvectors_teacher_${k}.txt" | 9 | VECTOR_FILE_MASSEFFECT="data/xvectors.txt" |
8 | 10 | ||
9 | # Number of clusters | ||
10 | kmean=6 | ||
11 | 11 | ||
12 | # Dirs | 12 | # Dirs |
13 | EXP_DIR="exp/kmeans_euclidian/teacher-pvector-1/${k}/${kmean}" | 13 | EXP_DIR="exp/kmeans_euclidian/xvectors/${k}/${kmean}" |
14 | CLUSTERING="${EXP_DIR}/clustering_${kmean}.pkl" | 14 | CLUSTERING="${EXP_DIR}/clustering_${kmean}.pkl" |
15 | 15 | ||
16 | 16 | ||
17 | # Output dirs | 17 | # Output dirs |
18 | OUTFILE_MASSEFFECT="data/pvectors_1rst/saved_clustered/masseffect_clustered_${k}_${kmean}.txt" | 18 | OUTFILE_MASSEFFECT="data/xvectors/saved_clustered/masseffect_clustered_xvectors_${k}_${kmean}.txt" |
19 | 19 | ||
20 | python3 bin/extract_kmeans.py "${CLUSTERING}" \ | 20 | python3 bin/extract_kmeans.py "${CLUSTERING}" \ |
21 | "${VECTOR_FILE_MASSEFFECT}" \ | 21 | "${VECTOR_FILE_MASSEFFECT}" \ |
run-clustering.sh
1 | # | 1 | # |
2 | # This script aims to compute clustering | 2 | # This script aims to compute clustering |
3 | # | 3 | # |
4 | 4 | ||
5 | 5 | ||
6 | # -- CONFIGURATION | 6 | # -- CONFIGURATION |
7 | # THIS SCRIPT NEEDS THESE VARIABLES | 7 | # THIS SCRIPT NEEDS THESE VARIABLES |
8 | # Vector file | 8 | # Vector file |
9 | #VECTOR_FILE="" | 9 | #VECTOR_FILE="" |
10 | # Train list | 10 | # Train list |
11 | #TRAIN_LST="" | 11 | #TRAIN_LST="" |
12 | # Val list | 12 | # Val list |
13 | #VAL_LST="" | 13 | #VAL_LST="" |
14 | # Exp directory | 14 | # Exp directory |
15 | #EXP_DIR="" | 15 | #EXP_DIR="" |
16 | # Metas file with type values | 16 | # Metas file with type values |
17 | #METAS_TYPE="" | 17 | #METAS_TYPE="" |
18 | # Metas file with character values | 18 | # Metas file with character values |
19 | #METAS_CHARACTER="" | 19 | #METAS_CHARACTER="" |
20 | 20 | ||
21 | 21 | ||
22 | #echo "VECTOR FILE: $VECTOR_FILE" | 22 | #echo "VECTOR FILE: $VECTOR_FILE" |
23 | #echo "TRAIN LIST: $TRAIN_LST" | 23 | #echo "TRAIN LIST: $TRAIN_LST" |
24 | #echo "VAL LIST: $VAL_LST" | 24 | #echo "VAL LIST: $VAL_LST" |
25 | #echo "EXP DIR: $EXP_DIR" | 25 | #echo "EXP DIR: $EXP_DIR" |
26 | #echo "METAS TYPE: $METAS_TYPE" | 26 | #echo "METAS TYPE: $METAS_TYPE" |
27 | #echo "METAS_CHARACTER: $METAS_CHARACTER" | 27 | #echo "METAS_CHARACTER: $METAS_CHARACTER" |
28 | 28 | ||
29 | 29 | ||
30 | 30 | ||
31 | # -- TRAIN KMEANS | 31 | # -- TRAIN KMEANS |
32 | echo "Clustering - ${kfold}" | 32 | echo "Clustering - ${kfold}" |
33 | python3 bin/cluster_kmeans.py "${VECTOR_FILE}" \ | 33 | python3 bin/cluster_kmeans.py "${VECTOR_FILE}" \ |
34 | "${TRAIN_LST}" \ | 34 | "${TRAIN_LST}" \ |
35 | "${EXP_DIR}" --kmin ${KMIN} --kmax ${KMAX} | 35 | "${EXP_DIR}" --kmin ${KMIN} --kmax ${KMAX} |
36 | 36 | ||
37 | 37 | ||
38 | 38 | ||
39 | for k in $(seq ${KMIN} 1 ${KMAX}) | 39 | for k in $(seq ${KMIN} 1 ${KMAX}) |
40 | do | 40 | do |
41 | SUB_EXP_DIR="${EXP_DIR}/${k}" | 41 | SUB_EXP_DIR="${EXP_DIR}/${k}" |
42 | 42 | ||
43 | # -- EXTRACT KMEANS VALUES | 43 | # -- EXTRACT KMEANS VALUES |
44 | echo "Kmeans Measuring and extraction - ${k}" | 44 | echo "Kmeans Measuring and extraction - ${k}" |
45 | python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \ | 45 | python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \ |
46 | "${VECTOR_FILE}" \ | 46 | "${VECTOR_FILE}" \ |
47 | --outfile "${SUB_EXP_DIR}/clustered_${k}.txt" | 47 | --outfile "${SUB_EXP_DIR}/clustered_${k}.txt" |
48 | # -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR | 48 | # -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR |
49 | # Measures | 49 | # Measures |
50 | python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | 50 | python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ |
51 | "${METAS_CHARACTER}" \ | 51 | "${METAS_CHARACTER}" \ |
52 | "${TRAIN_LST}" \ | 52 | "${TRAIN_LST}" \ |
53 | "${VAL_LST}" \ | 53 | "${VAL_LST}" \ |
54 | --outfile "${SUB_EXP_DIR}/measures.json" | 54 | --outfile "${SUB_EXP_DIR}/measures.json" |
55 | 55 | ||
56 | # Plot count matrix for train | 56 | # Plot count matrix for train |
57 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | 57 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ |
58 | ${VECTOR_FILE} \ | 58 | ${VECTOR_FILE} \ |
59 | ${TRAIN_LST} \ | 59 | ${TRAIN_LST} \ |
60 | --outfile "${SUB_EXP_DIR}/train_count_matrix.pdf" | 60 | --outfile "${SUB_EXP_DIR}/train_count_matrix.pdf" |
61 | 61 | ||
62 | # Plot count matrix for val | 62 | # Plot count matrix for val |
63 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | 63 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ |
64 | ${VECTOR_FILE} \ | 64 | ${VECTOR_FILE} \ |
65 | ${VAL_LST} \ | 65 | ${VAL_LST} \ |
66 | --outfile "${SUB_EXP_DIR}/val_count_matrix.pdf" | 66 | --outfile "${SUB_EXP_DIR}/val_count_matrix.pdf" |
67 | 67 | ||
68 | # -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR | 68 | # -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR |
69 | # Measures | 69 | # Measures |
70 | python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | 70 | python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ |
71 | "${METAS_TYPE}" \ | 71 | "${METAS_TYPE}" \ |
72 | "${TRAIN_LST}" \ | 72 | "${TRAIN_LST}" \ |
73 | "${VAL_LST}" \ | 73 | "${VAL_LST}" \ |
74 | --outfile "${SUB_EXP_DIR}/measures_type.json" | 74 | --outfile "${SUB_EXP_DIR}/measures_type.json" |
75 | 75 | ||
76 | # This script plots the count matrix of the train set | 76 | # This script plots the count matrix of the train set |
77 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | 77 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ |
78 | "${METAS_TYPE}" \ | 78 | "${METAS_TYPE}" \ |
79 | "${TRAIN_LST}" \ | 79 | "${TRAIN_LST}" \ |
80 | --outfile "${SUB_EXP_DIR}/train_count_matrix_type.pdf" | 80 | --outfile "${SUB_EXP_DIR}/train_count_matrix_type.pdf" |
81 | 81 | ||
82 | # This script plots the count matrix of the validation set | 82 | # This script plots the count matrix of the validation set |
83 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | 83 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ |
84 | "${METAS_TYPE}" \ | 84 | "${METAS_TYPE}" \ |
85 | "${VAL_LST}" \ | 85 | "${VAL_LST}" \ |
86 | --outfile "${SUB_EXP_DIR}/val_count_matrix_type.pdf" | 86 | --outfile "${SUB_EXP_DIR}/val_count_matrix_type.pdf" |
87 | 87 | ||
88 | |||
89 | # -- MEASURES AND PLOT WITH RESPECT TO LANG VAR | ||
90 | # Measures | ||
91 | python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | ||
92 | "${METAS_LANG}" \ | ||
93 | "${TRAIN_LST}" \ | ||
94 | "${VAL_LST}" \ | ||
95 | --outfile "${SUB_EXP_DIR}/measures_lang.json" | ||
96 | |||
97 | # This script plots the count matrix of the train set | ||
98 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | ||
99 | "${METAS_LANG}" \ | ||
100 | "${TRAIN_LST}" \ | ||
101 | --outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf" | ||
102 | |||
103 | # This script plots the count matrix of the validation set | ||
104 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | ||
105 | "${METAS_LANG}" \ | ||
106 | "${VAL_LST}" \ | ||
107 | --outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf" | ||
108 | |||
88 | done | 109 | done |
89 | 110 | ||
90 | 111 |
run-measures.sh
1 | # Pour le moment, le run ne fait qu'executer | 1 | # Pour le moment, le run ne fait qu'executer |
2 | # quelques petites commandes que l'on souhaite | 2 | # quelques petites commandes que l'on souhaite |
3 | # tester. | 3 | # tester. |
4 | 4 | ||
5 | OUTDIR="exp/kmeans_teacher_1/pvector-1" | 5 | OUTDIR="exp/kmeans_euclidian/teacher-pvector-1" |
6 | EXP_DIR=${OUTDIR} | ||
6 | DATADIR="data" | 7 | DATADIR="data" |
7 | NEW_LSTDIR="${OUTDIR}/lst" | 8 | NEW_LSTDIR="${OUTDIR}/lst" |
8 | 9 | ||
9 | kmin=2 | 10 | kmin=2 |
10 | kmax=100 | 11 | kmax=100 |
11 | 12 | ||
12 | if [ ! -d "$OUTDIR" ]; | 13 | if [ ! -d "$OUTDIR" ]; |
13 | then | 14 | then |
14 | mkdir -p $OUTDIR | 15 | mkdir -p $OUTDIR |
15 | fi | 16 | fi |
16 | 17 | ||
17 | if [ ! -d "$NEW_LSTDIR" ]; | 18 | if [ ! -d "$NEW_LSTDIR" ]; |
18 | then | 19 | then |
19 | mkdir -p $NEW_LSTDIR | 20 | mkdir -p $NEW_LSTDIR |
20 | fi | 21 | fi |
21 | 22 | ||
22 | for kfold in {1..4} | 23 | for kfold in {1..4} |
23 | do | 24 | do |
24 | pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt" | 25 | pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt" |
26 | VECTOR_FILE=$pvector_file | ||
25 | lst_dir="${DATADIR}/pvectors_1rst/lst" | 27 | lst_dir="${DATADIR}/pvectors_1rst/lst" |
26 | output_kfold="${OUTDIR}/${kfold}" | 28 | output_kfold="${OUTDIR}/${kfold}" |
27 | 29 | ||
28 | #python3 "bin/replace_label.py" \ | 30 | #python3 "bin/replace_label.py" \ |
29 | # "${DATADIR}/masseffect.lst" \ | 31 | # "${DATADIR}/masseffect.lst" \ |
30 | # "${DATADIR}/character_information.csv" \ | 32 | # "${DATADIR}/character_information.csv" \ |
31 | # --field "type" --lst "data/pvectors_1rst/lst/train_${kfold}.lst" \ | 33 | # --field "type" --lst "data/pvectors_1rst/lst/train_${kfold}.lst" \ |
32 | # --outfile "${NEW_LSTDIR}/train_${kfold}_type.lst" | 34 | # --outfile "${NEW_LSTDIR}/train_${kfold}_type.lst" |
33 | 35 | ||
34 | #python3 "bin/replace_label.py" \ | 36 | #python3 "bin/replace_label.py" \ |
35 | # "${DATADIR}/masseffect.lst" \ | 37 | # "${DATADIR}/masseffect.lst" \ |
36 | # "${DATADIR}/character_information.csv" \ | 38 | # "${DATADIR}/character_information.csv" \ |
37 | # --field "type" --lst "data/pvectors_1rst/lst/val_${kfold}.lst" \ | 39 | # --field "type" --lst "data/pvectors_1rst/lst/val_${kfold}.lst" \ |
38 | # --outfile "${NEW_LSTDIR}/val_${kfold}_type.lst" | 40 | # --outfile "${NEW_LSTDIR}/val_${kfold}_type.lst" |
39 | 41 | ||
40 | #cat "${NEW_LSTDIR}/train_${kfold}_type.lst" "${NEW_LSTDIR}/val_${kfold}_type.lst" > "${NEW_LSTDIR}/metas_${kfold}_type.lst" | 42 | #cat "${NEW_LSTDIR}/train_${kfold}_type.lst" "${NEW_LSTDIR}/val_${kfold}_type.lst" > "${NEW_LSTDIR}/metas_${kfold}_type.lst" |
43 | TRAIN_LST=${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst | ||
44 | VAL_LST=${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst | ||
45 | TRAIN_LANG_LST=${NEW_LSTDIR}/train_${kfold}_lang.lst | ||
46 | VAL_LANG_LST=${NEW_LSTDIR}/val_${kfold}_lang.lst | ||
47 | METAS_LANG=${NEW_LSTDIR}/metas_${kfold}_lang.lst | ||
48 | |||
49 | # EXTRACT LANGUAGE INFORMATION | ||
50 | awk '$2=$1' FS=, OFS=, ${TRAIN_LST} > ${TRAIN_LANG_LST} | ||
51 | echo "VAL EXTRACT LANGUAGE INFO DONE" | ||
52 | awk '$2=$1' FS=, OFS=, ${VAL_LST} > ${VAL_LANG_LST} | ||
53 | echo "TRAIN EXTRACT LANGUAGE INFO DONE" | ||
54 | cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}" | ||
55 | echo "GLOBAL EXTRACT LANGUAGE INFO DONE" | ||
56 | |||
41 | 57 | ||
42 | |||
43 | echo "Clustering - ${kfold}" | 58 | echo "Clustering - ${kfold}" |
44 | 59 | ||
45 | for k in $(seq ${kmin} 1 ${kmax}) | 60 | for k in $(seq ${kmin} 1 ${kmax}) |
46 | do | 61 | do |
47 | echo "Kmeans Measuring and ploting - ${k}" | 62 | echo "Kmeans Measuring and ploting - ${k}" |
48 | 63 | ||
49 | # This script compute measures from clustering | 64 | SUB_EXP_DIR="${EXP_DIR}/${kfold}/${k}" |
50 | #python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${pvector_file}" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures.json" | 65 | |
51 | 66 | # -- EXTRACT CLUSTERING LABELS | |
67 | python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \ | ||
68 | "${VECTOR_FILE}" \ | ||
69 | --outfile "${SUB_EXP_DIR}/clustered_${k}.txt" | ||
70 | |||
71 | # -- MEASURES AND PLOT WITH RESPECT TO LANG VAR | ||
72 | # Measures | ||
73 | python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | ||
74 | "${METAS_LANG}" \ | ||
75 | "${TRAIN_LST}" \ | ||
76 | "${VAL_LST}" \ | ||
77 | --outfile "${SUB_EXP_DIR}/measures_lang.json" | ||
78 | |||
79 | # This script plots the count matrix of the train set | ||
80 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | ||
81 | "${METAS_LANG}" \ | ||
82 | "${TRAIN_LST}" \ | ||
83 | --outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf" | ||
84 | |||
85 | # This script plots the count matrix of the validation set | ||
86 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | ||
87 | "${METAS_LANG}" \ | ||
88 | "${VAL_LST}" \ | ||
89 | --outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf" | ||
90 | |||
91 | rm ${SUB_EXP_DIR}/clustered_${k}.txt | ||
52 | #python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" \ | 92 | #python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" \ |
53 | # "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" \ | 93 | # "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" \ |
54 | # "${lst_dir}/val_${kfold}.lst" \ | 94 | # "${lst_dir}/val_${kfold}.lst" \ |
55 | # --outfile "${output_kfold}/${k}/measures_type.json" | 95 | # --outfile "${output_kfold}/${k}/measures_type.json" |
56 | 96 | ||
57 | # This script plots the count matrix of the train set | 97 | # This script plots the count matrix of the train set |
58 | python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | 98 | #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ |
59 | ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \ | 99 | # ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \ |
60 | --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf | 100 | # --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf |
61 | 101 | ||
62 | # This script plots the count matrix of the validation set | 102 | # This script plots the count matrix of the validation set |
63 | python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | 103 | #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ |
64 | ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \ | 104 | # ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \ |
65 | --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf | 105 | # --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf |
66 | 106 | ||
67 | # This script plots the count matrix of the train set | 107 | # This script plots the count matrix of the train set |
68 | python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | 108 | #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ |
69 | ${pvector_file} ${lst_dir}/train_${kfold}.lst \ | 109 | # ${pvector_file} ${lst_dir}/train_${kfold}.lst \ |
70 | --outfile ${output_kfold}/${k}/train_count_matrix.pdf | 110 | # --outfile ${output_kfold}/${k}/train_count_matrix.pdf |
71 | 111 | ||
72 | # This script plots the count matrix of the validation set | 112 | # This script plots the count matrix of the validation set |
73 | python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | 113 | #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ |
74 | ${pvector_file} ${lst_dir}/val_${kfold}.lst \ | 114 | # ${pvector_file} ${lst_dir}/val_${kfold}.lst \ |
75 | --outfile ${output_kfold}/${k}/val_count_matrix.pdf | 115 | # --outfile ${output_kfold}/${k}/val_count_matrix.pdf |
76 | done | 116 | done |
77 | done | 117 | done |
78 | 118 |
run.sh
1 | 1 | ||
2 | #OUTDIR="exp/test/pvector-2" | 2 | #OUTDIR="exp/test/pvector-2" |
3 | #DATADIR="data" | 3 | #DATADIR="data" |
4 | #NEW_LSTDIR="${OUTDIR}/lst" | 4 | #NEW_LSTDIR="${OUTDIR}/lst" |
5 | 5 | ||
6 | #VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher" | 6 | #VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher" |
7 | #VECTOR_FILES_END=".txt" | 7 | #VECTOR_FILES_END=".txt" |
8 | #VECTOR_FILE="" # To specify if there's only one | 8 | #VECTOR_FILE="" # To specify if there's only one |
9 | #VECTOR_FILES_ONE=false # Specify there's only one file | 9 | #VECTOR_FILES_ONE=false # Specify there's only one file |
10 | 10 | ||
11 | #KMIN=2 | 11 | #KMIN=2 |
12 | #KMAX=100 | 12 | #KMAX=100 |
13 | 13 | ||
14 | # -- LOAD CONFIG FILE | 14 | # -- LOAD CONFIG FILE |
15 | CONFIG_FILE="config.sh" | 15 | CONFIG_FILE="config.sh" |
16 | 16 | ||
17 | if [ $# -eq 1 ] | 17 | if [ $# -eq 1 ] |
18 | then | 18 | then |
19 | CONFIG_FILE="$1" | 19 | CONFIG_FILE="$1" |
20 | else | 20 | else |
21 | echo "Need to have one and only one argument" | 21 | echo "Need to have one and only one argument" |
22 | exit -1 | 22 | exit -1 |
23 | fi | 23 | fi |
24 | 24 | ||
25 | source $CONFIG_FILE | 25 | source $CONFIG_FILE |
26 | 26 | ||
27 | # -- DEFAULTS VALUES CONFIGURATION | 27 | # -- DEFAULTS VALUES CONFIGURATION |
28 | if [ -z "$VECTOR_FILES_ONE" ] | 28 | if [ -z "$VECTOR_FILES_ONE" ] |
29 | then | 29 | then |
30 | VECTOR_FILES_ONE=false | 30 | VECTOR_FILES_ONE=false |
31 | fi | 31 | fi |
32 | 32 | ||
33 | 33 | ||
34 | 34 | ||
35 | # -- MAKE DIRECTORIES | 35 | # -- MAKE DIRECTORIES |
36 | if [ ! -d "$OUTDIR" ]; | 36 | if [ ! -d "$OUTDIR" ]; |
37 | then | 37 | then |
38 | mkdir -p $OUTDIR | 38 | mkdir -p $OUTDIR |
39 | fi | 39 | fi |
40 | 40 | ||
41 | if [ ! -d "${NEW_LSTDIR}" ]; | 41 | if [ ! -d "${NEW_LSTDIR}" ]; |
42 | then | 42 | then |
43 | mkdir -p ${NEW_LSTDIR} | 43 | mkdir -p ${NEW_LSTDIR} |
44 | fi | 44 | fi |
45 | 45 | ||
46 | 46 | ||
47 | # -- KFOLD MIN and MAX | ||
48 | if [ -z "$MIN_KFOLD" ] | ||
49 | then | ||
50 | MIN_KFOLD=1 | ||
51 | fi | ||
52 | |||
53 | if [ -z "$MAX_KFOLD" ] | ||
54 | then | ||
55 | MAX_KFOLD=4 | ||
56 | fi | ||
57 | |||
47 | # -- BEGIN BY KFOLD | 58 | # -- BEGIN BY KFOLD |
48 | for kfold in {1..4} | 59 | for kfold in $(seq ${MIN_KFOLD} ${MAX_KFOLD}) |
49 | do | 60 | do |
50 | # Some useful variables | 61 | # Some useful variables |
51 | CHAR_INFO="${DATADIR}/character_information.csv" | 62 | CHAR_INFO="${DATADIR}/character_information.csv" |
52 | TRAIN_TYPE_LST="${NEW_LSTDIR}/train_${kfold}_type.lst" | 63 | TRAIN_TYPE_LST="${NEW_LSTDIR}/train_${kfold}_type.lst" |
53 | VAL_TYPE_LST="${NEW_LSTDIR}/val_${kfold}_type.lst" | 64 | VAL_TYPE_LST="${NEW_LSTDIR}/val_${kfold}_type.lst" |
65 | TRAIN_LANG_LST="${NEW_LSTDIR}/train_${kfold}_lang.lst" | ||
66 | VAL_LANG_LST="${NEW_LSTDIR}/val_${kfold}_lang.lst" | ||
54 | 67 | ||
55 | # Configuration for the run clustering file | 68 | # Configuration for the run clustering file |
56 | if [ ${VECTOR_FILES_ONE} == false ] | 69 | if [ ${VECTOR_FILES_ONE} == false ] |
57 | then | 70 | then |
58 | VECTOR_FILE="${VECTOR_FILES_BEGIN}_${kfold}${VECTOR_FILES_END}" | 71 | VECTOR_FILE="${VECTOR_FILES_BEGIN}_${kfold}${VECTOR_FILES_END}" |
59 | fi | 72 | fi |
60 | 73 | ||
61 | TRAIN_LST="${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst" | 74 | TRAIN_LST="${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst" |
62 | VAL_LST="${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst" | 75 | VAL_LST="${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst" |
63 | EXP_DIR="${OUTDIR}/${kfold}" | 76 | EXP_DIR="${OUTDIR}/${kfold}" |
64 | METAS_TYPE="${NEW_LSTDIR}/metas_${kfold}_type.lst" #* | 77 | METAS_TYPE="${NEW_LSTDIR}/metas_${kfold}_type.lst" |
65 | METAS_CHARACTER="${DATADIR}/masseffect.lst" | 78 | METAS_CHARACTER="${DATADIR}/masseffect.lst" |
66 | 79 | METAS_LANG="${NEW_LSTDIR}/metas_${kfold}_lang.lst" | |
67 | 80 | ||
68 | 81 | ||
69 | if [ ! -d "${EXP_DIR}" ]; | 82 | if [ ! -d "${EXP_DIR}" ]; |
70 | then | 83 | then |
71 | mkdir -p ${EXP_DIR} | 84 | mkdir -p ${EXP_DIR} |
72 | fi | 85 | fi |
73 | 86 | ||
74 | 87 | ||
75 | # Extract character information | 88 | # EXTRACT TYPE INFORMATION |
76 | echo "Extracting character information" | 89 | echo "Extracting character information" |
90 | echo "Replace in train" | ||
77 | python3 "bin/replace_label.py" \ | 91 | python3 "bin/replace_label.py" \ |
78 | "${METAS_CHARACTER}" \ | 92 | "${METAS_CHARACTER}" \ |
79 | "${CHAR_INFO}" \ | 93 | "${CHAR_INFO}" \ |
80 | --field "type" \ | 94 | --field "type" \ |
81 | --lst "${TRAIN_LST}" \ | 95 | --lst "${TRAIN_LST}" \ |
82 | --outfile "${TRAIN_TYPE_LST}" | 96 | --outfile "${TRAIN_TYPE_LST}" |
83 | 97 | ||
98 | echo "Replace in val" | ||
84 | python3 "bin/replace_label.py" \ | 99 | python3 "bin/replace_label.py" \ |
85 | "${METAS_CHARACTER}" \ | 100 | "${METAS_CHARACTER}" \ |
86 | "${CHAR_INFO}" \ | 101 | "${CHAR_INFO}" \ |
87 | --field "type" \ | 102 | --field "type" \ |
88 | --lst "${VAL_LST}" \ | 103 | --lst "${VAL_LST}" \ |
89 | --outfile "${VAL_TYPE_LST}" | 104 | --outfile "${VAL_TYPE_LST}" |
90 | 105 | ||
106 | echo "Merge them" | ||
91 | cat "${TRAIN_TYPE_LST}" "${VAL_TYPE_LST}" > "${METAS_TYPE}" | 107 | cat "${TRAIN_TYPE_LST}" "${VAL_TYPE_LST}" > "${METAS_TYPE}" |
92 | 108 | ||
109 | # EXTRACT LANGUAGE INFORMATION | ||
110 | echo "Language info for train" | ||
111 | awk '$2=$1' FS=, OFS=, ${TRAIN_LST} > ${TRAIN_LANG_LST} | ||
112 | echo "Language info for val" | ||
113 | awk '$2=$1' FS=, OFS=, ${VAL_LST} > ${VAL_LANG_LST} | ||
114 | |||
115 | echo "Merge them" | ||
116 | cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}" | ||
117 | |||
118 | echo "Then Run Clustering" | ||
93 | source "run-clustering.sh" | 119 | source "run-clustering.sh" |
94 | done | 120 | done |
95 | 121 | ||
96 | # Regroup measures with respect to character classes | 122 | # Regroup measures with respect to character classes |
97 | echo "Regrouping measures with respect to character classes" | 123 | echo "Regrouping measures with respect to character classes" |
98 | python3 "bin/regroup-measures.py" ${OUTDIR} | 124 | python3 "bin/regroup-measures.py" ${OUTDIR} |
99 | 125 | ||
100 | # Regroup measures with respect to type classes | 126 | # Regroup measures with respect to type classes |
101 | echo "Regrouping measures with respect to type classes" | 127 | echo "Regrouping measures with respect to type classes" |
102 | python3 "bin/regroup-measures.py" ${OUTDIR} --suffix "_type" --measurefile "measures_type.json" | 128 | python3 "bin/regroup-measures.py" ${OUTDIR} --suffix "_type" --measurefile "measures_type.json" |
103 | 129 | ||
104 | 130 | ||
105 | 131 |