New recipe to build clustering from pvector file, and measure with entropy, completeness, homogeneity, V-measure, and disequilibrium

New recipe to build clustering from pvector file, and measure with entropy, completeness, homogeneity, V-measure, and disequilibrium
Mathias Quillot
1 parent ca0fcf2c3f
Showing 1 changed file with 59 additions and 4 deletions Side-by-side Diff
run.sh
@@ -4,6 +4,7 @@
  
 OUTDIR="exp/kmeans_teacher_1/pvector-1"
 DATADIR="data"
+NEW_LSTDIR="${OUTDIR}/lst"
  
 kmin=2
 kmax=100
@@ -13,7 +14,12 @@
     mkdir -p $OUTDIR
 fi
  
-for kfold in {1..4}
+if [ ! -d "${NEW_LSTDIR}" ];
+then
+    mkdir -p ${NEW_LSTDIR}
+fi
+
+for kfold in 4 #..4}
 do
     #echo "kfold = ${kfold}"
     pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt"
  
@@ -24,9 +30,24 @@
     then
         mkdir -p ${output_kfold}
     fi
-
+        
  
-    # Train kmeans
+    # Extract character information
+    echo "Extracting character information"
+    python3 "bin/replace_label.py" \
+        "${DATADIR}/masseffect.lst" \
+        "${DATADIR}/character_information.csv" \
+        --field "type" --lst "data/pvectors_1rst/lst/train_${kfold}.lst" \
+        --outfile "${NEW_LSTDIR}/train_${kfold}_type.lst"
+    
+    python3 "bin/replace_label.py" \
+        "${DATADIR}/masseffect.lst" \
+        "${DATADIR}/character_information.csv" \
+        --field "type" --lst "data/pvectors_1rst/lst/val_${kfold}.lst" \
+        --outfile "${NEW_LSTDIR}/val_${kfold}_type.lst"
+    cat "${NEW_LSTDIR}/train_${kfold}_type.lst" "${NEW_LSTDIR}/val_${kfold}_type.lst" > "${NEW_LSTDIR}/metas_${kfold}_type.lst"
+
+    # -- TRAIN KMEANS 
     echo "Clustering - ${kfold}"
     python3 bin/cluster_kmeans.py "${pvector_file}" \
         "${lst_dir}/train_${kfold}.lst" \
  
  
  
@@ -34,12 +55,46 @@
  
     for k in $(seq ${kmin} 1 ${kmax})
     do
+        # -- EXTRACT KMEANS VALUES
         echo "Kmeans Measuring and extraction - ${k}"
-        # Extract kmean values
         python3 bin/extract_kmeans.py "${output_kfold}/${k}/clustering_${k}.pkl" \
             "${pvector_file}" \
             --outfile "${output_kfold}/${k}/clustered_${k}.txt"
+        
+        
+        # -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR
+        # Measures
         python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${pvector_file}" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures.json"
+        
+        # Plot count matrix for train
+        python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
+        ${pvector_file} ${lst_dir}/train_${kfold}.lst \
+        --outfile ${output_kfold}/${k}/train_count_matrix.pdf
+        
+        # Plot count matrix for val
+        python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
+            ${pvector_file} ${lst_dir}/val_${kfold}.lst \
+            --outfile ${output_kfold}/${k}/val_count_matrix.pdf
+
+        # Regroup measures with respect to character var
+        python3 bin/regroup-measures.py exp/kmeans_teacher_1/pvector-1/
+
+        # -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR
+        # Measures
+        python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures_type.json"
+        
+        # This script plot the count matrix of the train set
+        python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
+        ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \
+        --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf
+        
+        # This script plot the count matrix of the validation set
+        python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
+        ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \
+        --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf
+
+        # Regroup measures with respect to type var 
+        python3 bin/regroup-measures.py exp/kmeans_teacher_1/pvector-1/ --suffix "_type" --measurefile "measures_type.j
     done
 done
...	...	@@ -4,6 +4,7 @@
4	4
5	5	OUTDIR="exp/kmeans_teacher_1/pvector-1"
6	6	DATADIR="data"
	7	+NEW_LSTDIR="${OUTDIR}/lst"
7	8
8	9	kmin=2
9	10	kmax=100
...	...	@@ -13,7 +14,12 @@
13	14	mkdir -p $OUTDIR
14	15	fi
15	16
16		-for kfold in {1..4}
	17	+if [ ! -d "${NEW_LSTDIR}" ];
	18	+then
	19	+ mkdir -p ${NEW_LSTDIR}
	20	+fi
	21	+
	22	+for kfold in 4 #..4}
17	23	do
18	24	#echo "kfold = ${kfold}"
19	25	pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt"
20	26
...	...	@@ -24,9 +30,24 @@
24	30	then
25	31	mkdir -p ${output_kfold}
26	32	fi
27		-
	33	+
28	34
29		- # Train kmeans
	35	+ # Extract character information
	36	+ echo "Extracting character information"
	37	+ python3 "bin/replace_label.py" \
	38	+ "${DATADIR}/masseffect.lst" \
	39	+ "${DATADIR}/character_information.csv" \
	40	+ --field "type" --lst "data/pvectors_1rst/lst/train_${kfold}.lst" \
	41	+ --outfile "${NEW_LSTDIR}/train_${kfold}_type.lst"
	42	+
	43	+ python3 "bin/replace_label.py" \
	44	+ "${DATADIR}/masseffect.lst" \
	45	+ "${DATADIR}/character_information.csv" \
	46	+ --field "type" --lst "data/pvectors_1rst/lst/val_${kfold}.lst" \
	47	+ --outfile "${NEW_LSTDIR}/val_${kfold}_type.lst"
	48	+ cat "${NEW_LSTDIR}/train_${kfold}_type.lst" "${NEW_LSTDIR}/val_${kfold}_type.lst" > "${NEW_LSTDIR}/metas_${kfold}_type.lst"
	49	+
	50	+ # -- TRAIN KMEANS
30	51	echo "Clustering - ${kfold}"
31	52	python3 bin/cluster_kmeans.py "${pvector_file}" \
32	53	"${lst_dir}/train_${kfold}.lst" \
33	54
34	55
35	56
...	...	@@ -34,12 +55,46 @@
34	55
35	56	for k in $(seq ${kmin} 1 ${kmax})
36	57	do
	58	+ # -- EXTRACT KMEANS VALUES
37	59	echo "Kmeans Measuring and extraction - ${k}"
38		- # Extract kmean values
39	60	python3 bin/extract_kmeans.py "${output_kfold}/${k}/clustering_${k}.pkl" \
40	61	"${pvector_file}" \
41	62	--outfile "${output_kfold}/${k}/clustered_${k}.txt"
	63	+
	64	+
	65	+ # -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR
	66	+ # Measures
42	67	python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${pvector_file}" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures.json"
	68	+
	69	+ # Plot count matrix for train
	70	+ python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
	71	+ ${pvector_file} ${lst_dir}/train_${kfold}.lst \
	72	+ --outfile ${output_kfold}/${k}/train_count_matrix.pdf
	73	+
	74	+ # Plot count matrix for val
	75	+ python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
	76	+ ${pvector_file} ${lst_dir}/val_${kfold}.lst \
	77	+ --outfile ${output_kfold}/${k}/val_count_matrix.pdf
	78	+
	79	+ # Regroup measures with respect to character var
	80	+ python3 bin/regroup-measures.py exp/kmeans_teacher_1/pvector-1/
	81	+
	82	+ # -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR
	83	+ # Measures
	84	+ python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures_type.json"
	85	+
	86	+ # This script plot the count matrix of the train set
	87	+ python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
	88	+ ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \
	89	+ --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf
	90	+
	91	+ # This script plot the count matrix of the validation set
	92	+ python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
	93	+ ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \
	94	+ --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf
	95	+
	96	+ # Regroup measures with respect to type var
	97	+ python3 bin/regroup-measures.py exp/kmeans_teacher_1/pvector-1/ --suffix "_type" --measurefile "measures_type.j
43	98	done
44	99	done