Quillot Mathias / Clustering

Commit 29644ae6c3069921ff6c3992f3f2d7e1de09e999

Authored by Mathias Quillot 2019-07-22 12:12:10 +0200

Exists in master

New recipe to build clustering from pvector file, and measure with entropy, completeness, homogeneity, V-measure, and disequilibrium

Showing 1 changed file with 59 additions and 4 deletions Inline Diff

run.sh

run.sh

Diff comments View file @ 29644ae

1	# Pour le moment, le run ne fait qu'executer	1	# Pour le moment, le run ne fait qu'executer
2	# quelques petites commandes que l'on souhaite	2	# quelques petites commandes que l'on souhaite
3	# tester.	3	# tester.
4		4
5	OUTDIR="exp/kmeans_teacher_1/pvector-1"	5	OUTDIR="exp/kmeans_teacher_1/pvector-1"
6	DATADIR="data"	6	DATADIR="data"
		7	NEW_LSTDIR="${OUTDIR}/lst"
7		8
8	kmin=2	9	kmin=2
9	kmax=100	10	kmax=100
10		11
11	if [ ! -d "$OUTDIR" ];	12	if [ ! -d "$OUTDIR" ];
12	then	13	then
13	mkdir -p $OUTDIR	14	mkdir -p $OUTDIR
14	fi	15	fi
15		16
16	for kfold in {1..4}	17	if [ ! -d "${NEW_LSTDIR}" ];
		18	then
		19	mkdir -p ${NEW_LSTDIR}
		20	fi
		21
		22	for kfold in 4 #..4}
17	do	23	do
18	#echo "kfold = ${kfold}"	24	#echo "kfold = ${kfold}"
19	pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt"	25	pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt"
20	lst_dir="${DATADIR}/pvectors_1rst/lst"	26	lst_dir="${DATADIR}/pvectors_1rst/lst"
21	output_kfold="${OUTDIR}/${kfold}"	27	output_kfold="${OUTDIR}/${kfold}"
22		28
23	if [ ! -d "${output_kfold}" ];	29	if [ ! -d "${output_kfold}" ];
24	then	30	then
25	mkdir -p ${output_kfold}	31	mkdir -p ${output_kfold}
26	fi	32	fi
27		33
28		34
29	# Train kmeans	35	# Extract character information
		36	echo "Extracting character information"
		37	python3 "bin/replace_label.py" \
		38	"${DATADIR}/masseffect.lst" \
		39	"${DATADIR}/character_information.csv" \
		40	--field "type" --lst "data/pvectors_1rst/lst/train_${kfold}.lst" \
		41	--outfile "${NEW_LSTDIR}/train_${kfold}_type.lst"
		42
		43	python3 "bin/replace_label.py" \
		44	"${DATADIR}/masseffect.lst" \
		45	"${DATADIR}/character_information.csv" \
		46	--field "type" --lst "data/pvectors_1rst/lst/val_${kfold}.lst" \
		47	--outfile "${NEW_LSTDIR}/val_${kfold}_type.lst"
		48	cat "${NEW_LSTDIR}/train_${kfold}_type.lst" "${NEW_LSTDIR}/val_${kfold}_type.lst" > "${NEW_LSTDIR}/metas_${kfold}_type.lst"
		49
		50	# -- TRAIN KMEANS
30	echo "Clustering - ${kfold}"	51	echo "Clustering - ${kfold}"
31	python3 bin/cluster_kmeans.py "${pvector_file}" \	52	python3 bin/cluster_kmeans.py "${pvector_file}" \
32	"${lst_dir}/train_${kfold}.lst" \	53	"${lst_dir}/train_${kfold}.lst" \
33	"${output_kfold}" --kmin ${kmin} --kmax ${kmax}	54	"${output_kfold}" --kmin ${kmin} --kmax ${kmax}
34		55
35	for k in $(seq ${kmin} 1 ${kmax})	56	for k in $(seq ${kmin} 1 ${kmax})
36	do	57	do
		58	# -- EXTRACT KMEANS VALUES
37	echo "Kmeans Measuring and extraction - ${k}"	59	echo "Kmeans Measuring and extraction - ${k}"
38	# Extract kmean values
39	python3 bin/extract_kmeans.py "${output_kfold}/${k}/clustering_${k}.pkl" \	60	python3 bin/extract_kmeans.py "${output_kfold}/${k}/clustering_${k}.pkl" \
40	"${pvector_file}" \	61	"${pvector_file}" \
41	--outfile "${output_kfold}/${k}/clustered_${k}.txt"	62	--outfile "${output_kfold}/${k}/clustered_${k}.txt"
		63
		64
		65	# -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR
		66	# Measures
42	python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${pvector_file}" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures.json"	67	python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${pvector_file}" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures.json"
		68
		69	# Plot count matrix for train
		70	python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
		71	${pvector_file} ${lst_dir}/train_${kfold}.lst \
		72	--outfile ${output_kfold}/${k}/train_count_matrix.pdf
		73
		74	# Plot count matrix for val
		75	python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
		76	${pvector_file} ${lst_dir}/val_${kfold}.lst \
		77	--outfile ${output_kfold}/${k}/val_count_matrix.pdf
		78
		79	# Regroup measures with respect to character var
		80	python3 bin/regroup-measures.py exp/kmeans_teacher_1/pvector-1/
		81
		82	# -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR
		83	# Measures
		84	python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" "${lst_dir}/val_${kfold}.lst" --outfile "${output_kfold}/${k}/measures_type.json"
		85
		86	# This script plot the count matrix of the train set
		87	python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
		88	${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \
		89	--outfile ${output_kfold}/${k}/train_count_matrix_type.pdf
		90
		91	# This script plot the count matrix of the validation set
		92	python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
		93	${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \
		94	--outfile ${output_kfold}/${k}/val_count_matrix_type.pdf
		95
		96	# Regroup measures with respect to type var
		97	python3 bin/regroup-measures.py exp/kmeans_teacher_1/pvector-1/ --suffix "_type" --measurefile "measures_type.json"
43	done	98	done
44	done	99	done
45		100