From 29644ae6c3069921ff6c3992f3f2d7e1de09e999 Mon Sep 17 00:00:00 2001 From: Mathias Quillot Date: Mon, 22 Jul 2019 12:12:10 +0200 Subject: [PATCH] New recipe to build clustering from pvector file, and measure with entropy, completeness, homogeneity, V-measure, and disequilibrium --- run.sh | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 59 insertions(+), 4 deletions(-) diff --git a/run.sh b/run.sh index 40433b2..5def9ac 100755 --- a/run.sh +++ b/run.sh @@ -4,6 +4,7 @@ OUTDIR="exp/kmeans_teacher_1/pvector-1" DATADIR="data" +NEW_LSTDIR="${OUTDIR}/lst" kmin=2 kmax=100 @@ -13,7 +14,12 @@ then mkdir -p $OUTDIR fi -for kfold in {1..4} +if [ ! -d "${NEW_LSTDIR}" ]; +then + mkdir -p ${NEW_LSTDIR} +fi + +for kfold in 4 #..4} do #echo "kfold = ${kfold}" pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt" @@ -24,9 +30,24 @@ do then mkdir -p ${output_kfold} fi - + + + # Extract character information + echo "Extracting character information" + python3 "bin/replace_label.py" \ + "${DATADIR}/masseffect.lst" \ + "${DATADIR}/character_information.csv" \ + --field "type" --lst "data/pvectors_1rst/lst/train_${kfold}.lst" \ + --outfile "${NEW_LSTDIR}/train_${kfold}_type.lst" - # Train kmeans + python3 "bin/replace_label.py" \ + "${DATADIR}/masseffect.lst" \ + "${DATADIR}/character_information.csv" \ + --field "type" --lst "data/pvectors_1rst/lst/val_${kfold}.lst" \ + --outfile "${NEW_LSTDIR}/val_${kfold}_type.lst" + cat "${NEW_LSTDIR}/train_${kfold}_type.lst" "${NEW_LSTDIR}/val_${kfold}_type.lst" > "${NEW_LSTDIR}/metas_${kfold}_type.lst" + + # -- TRAIN KMEANS echo "Clustering - ${kfold}" python3 bin/cluster_kmeans.py "${pvector_file}" \ "${lst_dir}/train_${kfold}.lst" \ @@ -34,12 +55,46 @@ do for k in $(seq ${kmin} 1 ${kmax}) do + # -- EXTRACT KMEANS VALUES echo "Kmeans Measuring and extraction - ${k}" - # Extract kmean values python3 bin/extract_kmeans.py "${output_kfold}/${k}/clustering_${k}.pkl" \ "${pvector_file}" \ --outfile 
"${output_kfold}/${k}/clustered_${k}.txt" + + + # -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR + # Measures python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${pvector_file}" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures.json" + + # Plot count matrix for train + python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ + ${pvector_file} ${lst_dir}/train_${kfold}.lst \ + --outfile ${output_kfold}/${k}/train_count_matrix.pdf + + # Plot count matrix for val + python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ + ${pvector_file} ${lst_dir}/val_${kfold}.lst \ + --outfile ${output_kfold}/${k}/val_count_matrix.pdf + + # Regroup measures with respect to character var + python3 bin/regroup-measures.py exp/kmeans_teacher_1/pvector-1/ + + # -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR + # Measures + python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures_type.json" + + # This script plot the count matrix of the train set + python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ + ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \ + --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf + + # This script plot the count matrix of the validation set + python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ + ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \ + --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf + + # Regroup measures with respect to type var + python3 bin/regroup-measures.py exp/kmeans_teacher_1/pvector-1/ --suffix "_type" --measurefile "measures_type.j done done -- 1.8.2.3