Commit 29644ae6c3069921ff6c3992f3f2d7e1de09e999

Authored by Mathias Quillot
1 parent ca0fcf2c3f
Exists in master

New recipe to build clustering from pvector file, and measure with entropy, completeness, homogeneity, v measure, and disequilibrium

Showing 1 changed file with 59 additions and 4 deletions Inline Diff

1 # Pour le moment, le run ne fait qu'executer 1 # Pour le moment, le run ne fait qu'executer
2 # quelques petites commandes que l'on souhaite 2 # quelques petites commandes que l'on souhaite
3 # tester. 3 # tester.
4 4
5 OUTDIR="exp/kmeans_teacher_1/pvector-1" 5 OUTDIR="exp/kmeans_teacher_1/pvector-1"
6 DATADIR="data" 6 DATADIR="data"
7 NEW_LSTDIR="${OUTDIR}/lst"
7 8
8 kmin=2 9 kmin=2
9 kmax=100 10 kmax=100
10 11
11 if [ ! -d "$OUTDIR" ]; 12 if [ ! -d "$OUTDIR" ];
12 then 13 then
13 mkdir -p $OUTDIR 14 mkdir -p $OUTDIR
14 fi 15 fi
15 16
16 for kfold in {1..4} 17 if [ ! -d "${NEW_LSTDIR}" ];
18 then
19 mkdir -p ${NEW_LSTDIR}
20 fi
21
22 for kfold in 4 #..4}
17 do 23 do
18 #echo "kfold = ${kfold}" 24 #echo "kfold = ${kfold}"
19 pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt" 25 pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt"
20 lst_dir="${DATADIR}/pvectors_1rst/lst" 26 lst_dir="${DATADIR}/pvectors_1rst/lst"
21 output_kfold="${OUTDIR}/${kfold}" 27 output_kfold="${OUTDIR}/${kfold}"
22 28
23 if [ ! -d "${output_kfold}" ]; 29 if [ ! -d "${output_kfold}" ];
24 then 30 then
25 mkdir -p ${output_kfold} 31 mkdir -p ${output_kfold}
26 fi 32 fi
27 33
28 34
29 # Train kmeans 35 # Extract character information
36 echo "Extracting character information"
37 python3 "bin/replace_label.py" \
38 "${DATADIR}/masseffect.lst" \
39 "${DATADIR}/character_information.csv" \
40 --field "type" --lst "data/pvectors_1rst/lst/train_${kfold}.lst" \
41 --outfile "${NEW_LSTDIR}/train_${kfold}_type.lst"
42
43 python3 "bin/replace_label.py" \
44 "${DATADIR}/masseffect.lst" \
45 "${DATADIR}/character_information.csv" \
46 --field "type" --lst "data/pvectors_1rst/lst/val_${kfold}.lst" \
47 --outfile "${NEW_LSTDIR}/val_${kfold}_type.lst"
48 cat "${NEW_LSTDIR}/train_${kfold}_type.lst" "${NEW_LSTDIR}/val_${kfold}_type.lst" > "${NEW_LSTDIR}/metas_${kfold}_type.lst"
49
50 # -- TRAIN KMEANS
30 echo "Clustering - ${kfold}" 51 echo "Clustering - ${kfold}"
31 python3 bin/cluster_kmeans.py "${pvector_file}" \ 52 python3 bin/cluster_kmeans.py "${pvector_file}" \
32 "${lst_dir}/train_${kfold}.lst" \ 53 "${lst_dir}/train_${kfold}.lst" \
33 "${output_kfold}" --kmin ${kmin} --kmax ${kmax} 54 "${output_kfold}" --kmin ${kmin} --kmax ${kmax}
34 55
35 for k in $(seq ${kmin} 1 ${kmax}) 56 for k in $(seq ${kmin} 1 ${kmax})
36 do 57 do
58 # -- EXTRACT KMEANS VALUES
37 echo "Kmeans Measuring and extraction - ${k}" 59 echo "Kmeans Measuring and extraction - ${k}"
38 # Extract kmean values
39 python3 bin/extract_kmeans.py "${output_kfold}/${k}/clustering_${k}.pkl" \ 60 python3 bin/extract_kmeans.py "${output_kfold}/${k}/clustering_${k}.pkl" \
40 "${pvector_file}" \ 61 "${pvector_file}" \
41 --outfile "${output_kfold}/${k}/clustered_${k}.txt" 62 --outfile "${output_kfold}/${k}/clustered_${k}.txt"
63
64
65 # -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR
66 # Measures
42 python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${pvector_file}" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures.json" 67 python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${pvector_file}" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures.json"
68
69 # Plot count matrix for train
70 python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
71 ${pvector_file} ${lst_dir}/train_${kfold}.lst \
72 --outfile ${output_kfold}/${k}/train_count_matrix.pdf
73
74 # Plot count matrix for val
75 python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
76 ${pvector_file} ${lst_dir}/val_${kfold}.lst \
77 --outfile ${output_kfold}/${k}/val_count_matrix.pdf
78
79 # Regroup measures with respect to character var
80 python3 bin/regroup-measures.py exp/kmeans_teacher_1/pvector-1/
81
82 # -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR
83 # Measures
84 python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures_type.json"
85
86 # This script plot the count matrix of the train set
87 python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
88 ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \
89 --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf
90
91 # This script plot the count matrix of the validation set
92 python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
93 ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \
94 --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf
95
96 # Regroup measures with respect to type var
97 python3 bin/regroup-measures.py exp/kmeans_teacher_1/pvector-1/ --suffix "_type" --measurefile "measures_type.json"
43 done 98 done
44 done 99 done
45 100