Commit 95142dfdc54218f17529b6757ed7f310811b9534
1 parent
0ab563604a
Exists in
master
maj. No comment
Showing 7 changed files with 135 additions and 29 deletions Side-by-side Diff
bin/replace_label_lst.py
config/pv_from_xv_config.sh
1 | + | |
2 | +# Framework configuration | |
3 | +OUTDIR="exp/kmeans_euclidian/pv_from_xv" | |
4 | +DATADIR="data" | |
5 | +NEW_LSTDIR="${OUTDIR}/lst" | |
6 | + | |
7 | +VECTOR_FILES_BEGIN="${DATADIR}/pv_from_xv/me_pv_teacher" | |
8 | +VECTOR_FILES_END=".txt" | |
9 | +VECTOR_FILE="" # To specify if there's only one | |
10 | +VECTOR_FILES_ONE=false # Specify there's only one file | |
11 | + | |
12 | +KMIN=2 | |
13 | +KMAX=100 |
config/pvector_config.sh
extract-labels.sh
... | ... | @@ -2,20 +2,20 @@ |
2 | 2 | |
3 | 3 | # Number of set |
4 | 4 | k=4 |
5 | +kmean=88 | |
5 | 6 | |
7 | + | |
6 | 8 | # Vector features file |
7 | -VECTOR_FILE_MASSEFFECT="data/pvectors_1rst/pvectors_teacher_${k}.txt" | |
9 | +VECTOR_FILE_MASSEFFECT="data/xvectors.txt" | |
8 | 10 | |
9 | -# Number of clusters | |
10 | -kmean=6 | |
11 | 11 | |
12 | 12 | # Dirs |
13 | -EXP_DIR="exp/kmeans_euclidian/teacher-pvector-1/${k}/${kmean}" | |
13 | +EXP_DIR="exp/kmeans_euclidian/xvectors/${k}/${kmean}" | |
14 | 14 | CLUSTERING="${EXP_DIR}/clustering_${kmean}.pkl" |
15 | 15 | |
16 | 16 | |
17 | 17 | # Output dirs |
18 | -OUTFILE_MASSEFFECT="data/pvectors_1rst/saved_clustered/masseffect_clustered_${k}_${kmean}.txt" | |
18 | +OUTFILE_MASSEFFECT="data/xvectors/saved_clustered/masseffect_clustered_xvectors_${k}_${kmean}.txt" | |
19 | 19 | |
20 | 20 | python3 bin/extract_kmeans.py "${CLUSTERING}" \ |
21 | 21 | "${VECTOR_FILE_MASSEFFECT}" \ |
run-clustering.sh
... | ... | @@ -84,6 +84,27 @@ |
84 | 84 | "${METAS_TYPE}" \ |
85 | 85 | "${VAL_LST}" \ |
86 | 86 | --outfile "${SUB_EXP_DIR}/val_count_matrix_type.pdf" |
87 | - | |
87 | + | |
88 | + | |
89 | + # -- MEASURES AND PLOT WITH RESPECT TO LANG VAR | |
90 | + # Measures | |
91 | + python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | |
92 | + "${METAS_LANG}" \ | |
93 | + "${TRAIN_LST}" \ | |
94 | + "${VAL_LST}" \ | |
95 | + --outfile "${SUB_EXP_DIR}/measures_lang.json" | |
96 | + | |
97 | + # This script plots the count matrix of the train set | |
98 | + python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | |
99 | + "${METAS_LANG}" \ | |
100 | + "${TRAIN_LST}" \ | |
101 | + --outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf" | |
102 | + | |
103 | + # This script plots the count matrix of the validation set | |
104 | + python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | |
105 | + "${METAS_LANG}" \ | |
106 | + "${VAL_LST}" \ | |
107 | + --outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf" | |
108 | + | |
88 | 109 | done |
run-measures.sh
... | ... | @@ -2,7 +2,8 @@ |
2 | 2 | # quelques petites commandes que l'on souhaite |
3 | 3 | # tester. |
4 | 4 | |
5 | -OUTDIR="exp/kmeans_teacher_1/pvector-1" | |
5 | +OUTDIR="exp/kmeans_euclidian/teacher-pvector-1" | |
6 | +EXP_DIR=${OUTDIR} | |
6 | 7 | DATADIR="data" |
7 | 8 | NEW_LSTDIR="${OUTDIR}/lst" |
8 | 9 | |
... | ... | @@ -22,6 +23,7 @@ |
22 | 23 | for kfold in {1..4} |
23 | 24 | do |
24 | 25 | pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt" |
26 | + VECTOR_FILE=$pvector_file | |
25 | 27 | lst_dir="${DATADIR}/pvectors_1rst/lst" |
26 | 28 | output_kfold="${OUTDIR}/${kfold}" |
27 | 29 | |
28 | 30 | |
29 | 31 | |
30 | 32 | |
31 | 33 | |
32 | 34 | |
33 | 35 | |
... | ... | @@ -38,41 +40,79 @@ |
38 | 40 | # --outfile "${NEW_LSTDIR}/val_${kfold}_type.lst" |
39 | 41 | |
40 | 42 | #cat "${NEW_LSTDIR}/train_${kfold}_type.lst" "${NEW_LSTDIR}/val_${kfold}_type.lst" > "${NEW_LSTDIR}/metas_${kfold}_type.lst" |
43 | + TRAIN_LST=${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst | |
44 | + VAL_LST=${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst | |
45 | + TRAIN_LANG_LST=${NEW_LSTDIR}/train_${kfold}_lang.lst | |
46 | + VAL_LANG_LST=${NEW_LSTDIR}/val_${kfold}_lang.lst | |
47 | + METAS_LANG=${NEW_LSTDIR}/metas_${kfold}_lang.lst | |
48 | + | |
49 | + # EXTRACT LANGUAGE INFORMATION | |
50 | + awk '$2=$1' FS=, OFS=, ${TRAIN_LST} > ${TRAIN_LANG_LST} | |
51 | + echo "VAL EXTRACT LANGUAGE INFO DONE" | |
52 | + awk '$2=$1' FS=, OFS=, ${VAL_LST} > ${VAL_LANG_LST} | |
53 | + echo "TRAIN EXTRACT LANGUAGE INFO DONE" | |
54 | + cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}" | |
55 | + echo "GLOBAL EXTRACT LANGUAGE INFO DONE" | |
56 | + | |
41 | 57 | |
42 | - | |
43 | 58 | echo "Clustering - ${kfold}" |
44 | 59 | |
45 | 60 | for k in $(seq ${kmin} 1 ${kmax}) |
46 | 61 | do |
47 | 62 | echo "Kmeans Measuring and ploting - ${k}" |
48 | - | |
49 | - # This script compute measures from clustering | |
50 | - #python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${pvector_file}" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures.json" | |
51 | - | |
63 | + | |
64 | + SUB_EXP_DIR="${EXP_DIR}/${kfold}/${k}" | |
65 | + | |
66 | + # -- EXTRACT CLUSTERING LABELS | |
67 | + python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \ | |
68 | + "${VECTOR_FILE}" \ | |
69 | + --outfile "${SUB_EXP_DIR}/clustered_${k}.txt" | |
70 | + | |
71 | + # -- MEASURES AND PLOT WITH RESPECT TO LANG VAR | |
72 | + # Measures | |
73 | + python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | |
74 | + "${METAS_LANG}" \ | |
75 | + "${TRAIN_LST}" \ | |
76 | + "${VAL_LST}" \ | |
77 | + --outfile "${SUB_EXP_DIR}/measures_lang.json" | |
78 | + | |
79 | + # This script plots the count matrix of the train set | |
80 | + python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | |
81 | + "${METAS_LANG}" \ | |
82 | + "${TRAIN_LST}" \ | |
83 | + --outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf" | |
84 | + | |
85 | + # This script plots the count matrix of the validation set | |
86 | + python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | |
87 | + "${METAS_LANG}" \ | |
88 | + "${VAL_LST}" \ | |
89 | + --outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf" | |
90 | + | |
91 | + rm ${SUB_EXP_DIR}/clustered_${k}.txt | |
52 | 92 | #python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" \ |
53 | 93 | # "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" \ |
54 | 94 | # "${lst_dir}/val_${kfold}.lst" \ |
55 | 95 | # --outfile "${output_kfold}/${k}/measures_type.json" |
56 | 96 | |
57 | 97 | # This script plots the count matrix of the train set |
58 | - python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | |
59 | - ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \ | |
60 | - --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf | |
98 | + #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | |
99 | + # ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \ | |
100 | + # --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf | |
61 | 101 | |
62 | 102 | # This script plots the count matrix of the validation set |
63 | - python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | |
64 | - ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \ | |
65 | - --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf | |
103 | + #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | |
104 | + # ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \ | |
105 | + # --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf | |
66 | 106 | |
67 | 107 | # This script plots the count matrix of the train set |
68 | - python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | |
69 | - ${pvector_file} ${lst_dir}/train_${kfold}.lst \ | |
70 | - --outfile ${output_kfold}/${k}/train_count_matrix.pdf | |
108 | + #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | |
109 | + # ${pvector_file} ${lst_dir}/train_${kfold}.lst \ | |
110 | + # --outfile ${output_kfold}/${k}/train_count_matrix.pdf | |
71 | 111 | |
72 | 112 | # This script plots the count matrix of the validation set |
73 | - python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | |
74 | - ${pvector_file} ${lst_dir}/val_${kfold}.lst \ | |
75 | - --outfile ${output_kfold}/${k}/val_count_matrix.pdf | |
113 | + #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | |
114 | + # ${pvector_file} ${lst_dir}/val_${kfold}.lst \ | |
115 | + # --outfile ${output_kfold}/${k}/val_count_matrix.pdf | |
76 | 116 | done |
77 | 117 | done |
run.sh
... | ... | @@ -44,13 +44,26 @@ |
44 | 44 | fi |
45 | 45 | |
46 | 46 | |
47 | +# -- KFOLD MIN and MAX | |
48 | +if [ -z "$MIN_KFOLD" ] | |
49 | +then | |
50 | + MIN_KFOLD=1 | |
51 | +fi | |
52 | + | |
53 | +if [ -z "$MAX_KFOLD" ] | |
54 | +then | |
55 | + MAX_KFOLD=4 | |
56 | +fi | |
57 | + | |
47 | 58 | # -- BEGIN BY KFOLD |
48 | -for kfold in {1..4} | |
59 | +for kfold in $(seq ${MIN_KFOLD} ${MAX_KFOLD}) | |
49 | 60 | do |
50 | 61 | # Some useful variables |
51 | 62 | CHAR_INFO="${DATADIR}/character_information.csv" |
52 | 63 | TRAIN_TYPE_LST="${NEW_LSTDIR}/train_${kfold}_type.lst" |
53 | 64 | VAL_TYPE_LST="${NEW_LSTDIR}/val_${kfold}_type.lst" |
65 | + TRAIN_LANG_LST="${NEW_LSTDIR}/train_${kfold}_lang.lst" | |
66 | + VAL_LANG_LST="${NEW_LSTDIR}/val_${kfold}_lang.lst" | |
54 | 67 | |
55 | 68 | # Configuration for the run clustering file |
56 | 69 | if [ ${VECTOR_FILES_ONE} == false ] |
57 | 70 | |
... | ... | @@ -61,9 +74,9 @@ |
61 | 74 | TRAIN_LST="${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst" |
62 | 75 | VAL_LST="${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst" |
63 | 76 | EXP_DIR="${OUTDIR}/${kfold}" |
64 | - METAS_TYPE="${NEW_LSTDIR}/metas_${kfold}_type.lst" #* | |
77 | + METAS_TYPE="${NEW_LSTDIR}/metas_${kfold}_type.lst" | |
65 | 78 | METAS_CHARACTER="${DATADIR}/masseffect.lst" |
66 | - | |
79 | + METAS_LANG="${NEW_LSTDIR}/metas_${kfold}_lang.lst" | |
67 | 80 | |
68 | 81 | |
69 | 82 | if [ ! -d "${EXP_DIR}" ]; |
70 | 83 | |
... | ... | @@ -72,8 +85,9 @@ |
72 | 85 | fi |
73 | 86 | |
74 | 87 | |
75 | - # Extract character information | |
88 | + # EXTRACT TYPE INFORMATION | |
76 | 89 | echo "Extracting character information" |
90 | + echo "Replace in train" | |
77 | 91 | python3 "bin/replace_label.py" \ |
78 | 92 | "${METAS_CHARACTER}" \ |
79 | 93 | "${CHAR_INFO}" \ |
... | ... | @@ -81,6 +95,7 @@ |
81 | 95 | --lst "${TRAIN_LST}" \ |
82 | 96 | --outfile "${TRAIN_TYPE_LST}" |
83 | 97 | |
98 | + echo "Replace in val" | |
84 | 99 | python3 "bin/replace_label.py" \ |
85 | 100 | "${METAS_CHARACTER}" \ |
86 | 101 | "${CHAR_INFO}" \ |
87 | 102 | |
... | ... | @@ -88,8 +103,19 @@ |
88 | 103 | --lst "${VAL_LST}" \ |
89 | 104 | --outfile "${VAL_TYPE_LST}" |
90 | 105 | |
106 | + echo "Merge them" | |
91 | 107 | cat "${TRAIN_TYPE_LST}" "${VAL_TYPE_LST}" > "${METAS_TYPE}" |
92 | - | |
108 | + | |
109 | + # EXTRACT LANGUAGE INFORMATION | |
110 | + echo "Language info for train" | |
111 | + awk '$2=$1' FS=, OFS=, ${TRAIN_LST} > ${TRAIN_LANG_LST} | |
112 | + echo "Language info for val" | |
113 | + awk '$2=$1' FS=, OFS=, ${VAL_LST} > ${VAL_LANG_LST} | |
114 | + | |
115 | + echo "Merge them" | |
116 | + cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}" | |
117 | + | |
118 | + echo "Then Run Clustering" | |
93 | 119 | source "run-clustering.sh" |
94 | 120 | done |
95 | 121 |