Commit 29644ae6c3069921ff6c3992f3f2d7e1de09e999
1 parent
ca0fcf2c3f
Exists in
master
New recipe to build clustering from pvector file, and measure with entropy, completeness, homogeneity, v measure, and disequilibrium
Showing 1 changed file with 59 additions and 4 deletions Side-by-side Diff
run.sh
... | ... | @@ -4,6 +4,7 @@ |
4 | 4 | |
5 | 5 | OUTDIR="exp/kmeans_teacher_1/pvector-1" |
6 | 6 | DATADIR="data" |
7 | +NEW_LSTDIR="${OUTDIR}/lst" | |
7 | 8 | |
8 | 9 | kmin=2 |
9 | 10 | kmax=100 |
... | ... | @@ -13,7 +14,12 @@ |
13 | 14 | mkdir -p $OUTDIR |
14 | 15 | fi |
15 | 16 | |
16 | -for kfold in {1..4} | |
17 | +if [ ! -d "${NEW_LSTDIR}" ]; | |
18 | +then | |
19 | + mkdir -p ${NEW_LSTDIR} | |
20 | +fi | |
21 | + | |
22 | +for kfold in 4 #..4} | |
17 | 23 | do |
18 | 24 | #echo "kfold = ${kfold}" |
19 | 25 | pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt" |
20 | 26 | |
... | ... | @@ -24,9 +30,24 @@ |
24 | 30 | then |
25 | 31 | mkdir -p ${output_kfold} |
26 | 32 | fi |
27 | - | |
33 | + | |
28 | 34 | |
29 | - # Train kmeans | |
35 | + # Extract character information | |
36 | + echo "Extracting character information" | |
37 | + python3 "bin/replace_label.py" \ | |
38 | + "${DATADIR}/masseffect.lst" \ | |
39 | + "${DATADIR}/character_information.csv" \ | |
40 | + --field "type" --lst "data/pvectors_1rst/lst/train_${kfold}.lst" \ | |
41 | + --outfile "${NEW_LSTDIR}/train_${kfold}_type.lst" | |
42 | + | |
43 | + python3 "bin/replace_label.py" \ | |
44 | + "${DATADIR}/masseffect.lst" \ | |
45 | + "${DATADIR}/character_information.csv" \ | |
46 | + --field "type" --lst "data/pvectors_1rst/lst/val_${kfold}.lst" \ | |
47 | + --outfile "${NEW_LSTDIR}/val_${kfold}_type.lst" | |
48 | + cat "${NEW_LSTDIR}/train_${kfold}_type.lst" "${NEW_LSTDIR}/val_${kfold}_type.lst" > "${NEW_LSTDIR}/metas_${kfold}_type.lst" | |
49 | + | |
50 | + # -- TRAIN KMEANS | |
30 | 51 | echo "Clustering - ${kfold}" |
31 | 52 | python3 bin/cluster_kmeans.py "${pvector_file}" \ |
32 | 53 | "${lst_dir}/train_${kfold}.lst" \ |
33 | 54 | |
34 | 55 | |
35 | 56 | |
... | ... | @@ -34,12 +55,46 @@ |
34 | 55 | |
35 | 56 | for k in $(seq ${kmin} 1 ${kmax}) |
36 | 57 | do |
58 | + # -- EXTRACT KMEANS VALUES | |
37 | 59 | echo "Kmeans Measuring and extraction - ${k}" |
38 | - # Extract kmean values | |
39 | 60 | python3 bin/extract_kmeans.py "${output_kfold}/${k}/clustering_${k}.pkl" \ |
40 | 61 | "${pvector_file}" \ |
41 | 62 | --outfile "${output_kfold}/${k}/clustered_${k}.txt" |
63 | + | |
64 | + | |
65 | + # -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR | |
66 | + # Measures | |
42 | 67 | python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${pvector_file}" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures.json" |
68 | + | |
69 | + # Plot count matrix for train | |
70 | + python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | |
71 | + ${pvector_file} ${lst_dir}/train_${kfold}.lst \ | |
72 | + --outfile ${output_kfold}/${k}/train_count_matrix.pdf | |
73 | + | |
74 | + # Plot count matrix for val | |
75 | + python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | |
76 | + ${pvector_file} ${lst_dir}/val_${kfold}.lst \ | |
77 | + --outfile ${output_kfold}/${k}/val_count_matrix.pdf | |
78 | + | |
79 | + # Regroup measures with respect to character var | |
80 | + python3 bin/regroup-measures.py exp/kmeans_teacher_1/pvector-1/ | |
81 | + | |
82 | + # -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR | |
83 | + # Measures | |
84 | + python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures_type.json" | |
85 | + | |
86 | + # This script plot the count matrix of the train set | |
87 | + python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | |
88 | + ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \ | |
89 | + --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf | |
90 | + | |
91 | + # This script plot the count matrix of the validation set | |
92 | + python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | |
93 | + ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \ | |
94 | + --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf | |
95 | + | |
96 | + # Regroup measures with respect to type var | |
97 | + python3 bin/regroup-measures.py exp/kmeans_teacher_1/pvector-1/ --suffix "_type" --measurefile "measures_type.j | |
43 | 98 | done |
44 | 99 | done |