Commit 29644ae6c3069921ff6c3992f3f2d7e1de09e999

Authored by Mathias Quillot
1 parent ca0fcf2c3f
Exists in master

New recipe to build a clustering from a pvector file, and measure it with entropy, completeness, homogeneity, V-measure, and disequilibrium

Showing 1 changed file with 59 additions and 4 deletions Side-by-side Diff

... ... @@ -4,6 +4,7 @@
4 4  
5 5 OUTDIR="exp/kmeans_teacher_1/pvector-1"
6 6 DATADIR="data"
  7 +NEW_LSTDIR="${OUTDIR}/lst"
7 8  
8 9 kmin=2
9 10 kmax=100
... ... @@ -13,7 +14,12 @@
13 14 mkdir -p $OUTDIR
14 15 fi
15 16  
16   -for kfold in {1..4}
  17 +if [ ! -d "${NEW_LSTDIR}" ];
  18 +then
  19 + mkdir -p ${NEW_LSTDIR}
  20 +fi
  21 +
  22 +for kfold in 4 #..4}
17 23 do
18 24 #echo "kfold = ${kfold}"
19 25 pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt"
20 26  
... ... @@ -24,9 +30,24 @@
24 30 then
25 31 mkdir -p ${output_kfold}
26 32 fi
27   -
  33 +
28 34  
29   - # Train kmeans
  35 + # Extract character information
  36 + echo "Extracting character information"
  37 + python3 "bin/replace_label.py" \
  38 + "${DATADIR}/masseffect.lst" \
  39 + "${DATADIR}/character_information.csv" \
  40 + --field "type" --lst "data/pvectors_1rst/lst/train_${kfold}.lst" \
  41 + --outfile "${NEW_LSTDIR}/train_${kfold}_type.lst"
  42 +
  43 + python3 "bin/replace_label.py" \
  44 + "${DATADIR}/masseffect.lst" \
  45 + "${DATADIR}/character_information.csv" \
  46 + --field "type" --lst "data/pvectors_1rst/lst/val_${kfold}.lst" \
  47 + --outfile "${NEW_LSTDIR}/val_${kfold}_type.lst"
  48 + cat "${NEW_LSTDIR}/train_${kfold}_type.lst" "${NEW_LSTDIR}/val_${kfold}_type.lst" > "${NEW_LSTDIR}/metas_${kfold}_type.lst"
  49 +
  50 + # -- TRAIN KMEANS
30 51 echo "Clustering - ${kfold}"
31 52 python3 bin/cluster_kmeans.py "${pvector_file}" \
32 53 "${lst_dir}/train_${kfold}.lst" \
33 54  
34 55  
35 56  
... ... @@ -34,12 +55,46 @@
34 55  
35 56 for k in $(seq ${kmin} 1 ${kmax})
36 57 do
  58 + # -- EXTRACT KMEANS VALUES
37 59 echo "Kmeans Measuring and extraction - ${k}"
38   - # Extract kmean values
39 60 python3 bin/extract_kmeans.py "${output_kfold}/${k}/clustering_${k}.pkl" \
40 61 "${pvector_file}" \
41 62 --outfile "${output_kfold}/${k}/clustered_${k}.txt"
  63 +
  64 +
  65 + # -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR
  66 + # Measures
42 67 python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${pvector_file}" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures.json"
  68 +
  69 + # Plot count matrix for train
  70 + python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
  71 + ${pvector_file} ${lst_dir}/train_${kfold}.lst \
  72 + --outfile ${output_kfold}/${k}/train_count_matrix.pdf
  73 +
  74 + # Plot count matrix for val
  75 + python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
  76 + ${pvector_file} ${lst_dir}/val_${kfold}.lst \
  77 + --outfile ${output_kfold}/${k}/val_count_matrix.pdf
  78 +
  79 + # Regroup measures with respect to character var
  80 + python3 bin/regroup-measures.py exp/kmeans_teacher_1/pvector-1/
  81 +
  82 + # -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR
  83 + # Measures
  84 + python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures_type.json"
  85 +
  86 + # This script plot the count matrix of the train set
  87 + python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
  88 + ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \
  89 + --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf
  90 +
  91 + # This script plot the count matrix of the validation set
  92 + python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
  93 + ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \
  94 + --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf
  95 +
  96 + # Regroup measures with respect to type var
  97 + python3 bin/regroup-measures.py exp/kmeans_teacher_1/pvector-1/ --suffix "_type" --measurefile "measures_type.j
43 98 done
44 99 done