Commit 29644ae6c3069921ff6c3992f3f2d7e1de09e999
1 parent
ca0fcf2c3f
Exists in
master
New recipe to build clustering from pvector file, and measure with entropy, completeness, homogeneity, V-measure, and disequilibrium
Showing 1 changed file with 59 additions and 4 deletions Inline Diff
run.sh
1 | # Pour le moment, le run ne fait qu'executer | 1 | # Pour le moment, le run ne fait qu'executer |
2 | # quelques petites commandes que l'on souhaite | 2 | # quelques petites commandes que l'on souhaite |
3 | # tester. | 3 | # tester. |
4 | 4 | ||
5 | OUTDIR="exp/kmeans_teacher_1/pvector-1" | 5 | OUTDIR="exp/kmeans_teacher_1/pvector-1" |
6 | DATADIR="data" | 6 | DATADIR="data" |
7 | NEW_LSTDIR="${OUTDIR}/lst" | ||
7 | 8 | ||
8 | kmin=2 | 9 | kmin=2 |
9 | kmax=100 | 10 | kmax=100 |
10 | 11 | ||
11 | if [ ! -d "$OUTDIR" ]; | 12 | if [ ! -d "$OUTDIR" ]; |
12 | then | 13 | then |
13 | mkdir -p $OUTDIR | 14 | mkdir -p $OUTDIR |
14 | fi | 15 | fi |
15 | 16 | ||
16 | for kfold in {1..4} | 17 | if [ ! -d "${NEW_LSTDIR}" ]; |
18 | then | ||
19 | mkdir -p ${NEW_LSTDIR} | ||
20 | fi | ||
21 | |||
22 | for kfold in 4 #..4} | ||
17 | do | 23 | do |
18 | #echo "kfold = ${kfold}" | 24 | #echo "kfold = ${kfold}" |
19 | pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt" | 25 | pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt" |
20 | lst_dir="${DATADIR}/pvectors_1rst/lst" | 26 | lst_dir="${DATADIR}/pvectors_1rst/lst" |
21 | output_kfold="${OUTDIR}/${kfold}" | 27 | output_kfold="${OUTDIR}/${kfold}" |
22 | 28 | ||
23 | if [ ! -d "${output_kfold}" ]; | 29 | if [ ! -d "${output_kfold}" ]; |
24 | then | 30 | then |
25 | mkdir -p ${output_kfold} | 31 | mkdir -p ${output_kfold} |
26 | fi | 32 | fi |
27 | 33 | ||
28 | 34 | ||
29 | # Train kmeans | 35 | # Extract character information |
36 | echo "Extracting character information" | ||
37 | python3 "bin/replace_label.py" \ | ||
38 | "${DATADIR}/masseffect.lst" \ | ||
39 | "${DATADIR}/character_information.csv" \ | ||
40 | --field "type" --lst "data/pvectors_1rst/lst/train_${kfold}.lst" \ | ||
41 | --outfile "${NEW_LSTDIR}/train_${kfold}_type.lst" | ||
42 | |||
43 | python3 "bin/replace_label.py" \ | ||
44 | "${DATADIR}/masseffect.lst" \ | ||
45 | "${DATADIR}/character_information.csv" \ | ||
46 | --field "type" --lst "data/pvectors_1rst/lst/val_${kfold}.lst" \ | ||
47 | --outfile "${NEW_LSTDIR}/val_${kfold}_type.lst" | ||
48 | cat "${NEW_LSTDIR}/train_${kfold}_type.lst" "${NEW_LSTDIR}/val_${kfold}_type.lst" > "${NEW_LSTDIR}/metas_${kfold}_type.lst" | ||
49 | |||
50 | # -- TRAIN KMEANS | ||
30 | echo "Clustering - ${kfold}" | 51 | echo "Clustering - ${kfold}" |
31 | python3 bin/cluster_kmeans.py "${pvector_file}" \ | 52 | python3 bin/cluster_kmeans.py "${pvector_file}" \ |
32 | "${lst_dir}/train_${kfold}.lst" \ | 53 | "${lst_dir}/train_${kfold}.lst" \ |
33 | "${output_kfold}" --kmin ${kmin} --kmax ${kmax} | 54 | "${output_kfold}" --kmin ${kmin} --kmax ${kmax} |
34 | 55 | ||
35 | for k in $(seq ${kmin} 1 ${kmax}) | 56 | for k in $(seq ${kmin} 1 ${kmax}) |
36 | do | 57 | do |
58 | # -- EXTRACT KMEANS VALUES | ||
37 | echo "Kmeans Measuring and extraction - ${k}" | 59 | echo "Kmeans Measuring and extraction - ${k}" |
38 | # Extract kmean values | ||
39 | python3 bin/extract_kmeans.py "${output_kfold}/${k}/clustering_${k}.pkl" \ | 60 | python3 bin/extract_kmeans.py "${output_kfold}/${k}/clustering_${k}.pkl" \ |
40 | "${pvector_file}" \ | 61 | "${pvector_file}" \ |
41 | --outfile "${output_kfold}/${k}/clustered_${k}.txt" | 62 | --outfile "${output_kfold}/${k}/clustered_${k}.txt" |
63 | |||
64 | |||
65 | # -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR | ||
66 | # Measures | ||
42 | python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${pvector_file}" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures.json" | 67 | python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${pvector_file}" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures.json" |
68 | |||
69 | # Plot count matrix for train | ||
70 | python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | ||
71 | ${pvector_file} ${lst_dir}/train_${kfold}.lst \ | ||
72 | --outfile ${output_kfold}/${k}/train_count_matrix.pdf | ||
73 | |||
74 | # Plot count matrix for val | ||
75 | python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | ||
76 | ${pvector_file} ${lst_dir}/val_${kfold}.lst \ | ||
77 | --outfile ${output_kfold}/${k}/val_count_matrix.pdf | ||
78 | |||
79 | # Regroup measures with respect to character var | ||
80 | python3 bin/regroup-measures.py exp/kmeans_teacher_1/pvector-1/ | ||
81 | |||
82 | # -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR | ||
83 | # Measures | ||
84 | python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures_type.json" | ||
85 | |||
86 | # This script plots the count matrix of the train set | ||
87 | python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | ||
88 | ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \ | ||
89 | --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf | ||
90 | |||
91 | # This script plots the count matrix of the validation set | ||
92 | python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | ||
93 | ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \ | ||
94 | --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf | ||
95 | |||
96 | # Regroup measures with respect to type var | ||
97 | python3 bin/regroup-measures.py exp/kmeans_teacher_1/pvector-1/ --suffix "_type" --measurefile "measures_type.json" | ||
43 | done | 98 | done |
44 | done | 99 | done |
45 | 100 |