Quillot Mathias / Clustering

README.md

 # Clustering
 A repository where I put everything dealing with clustering algorithms.
+# How to use
+You can run the run.sh script directly if you want. You just need data.
+You can also use some of the scripts in utils, but run them from the root directory "clustering/".
 # TODO
 - Organiser les différentes listes de données pour mes expériences
 - Create a data file example
 # Data
 # File format

bin/regroup-measures.py

Diff comments View file @ e63ab06

 '''
 Regroup results into one file and a plot.
 TODO: Highlight the maximum values
 TODO: Save the values somewhere so that they can be easily retrieved.
 '''
 import numpy as np
 import matplotlib.pyplot as plt
 import argparse
 import os
 import json
 def plot_values_clusters(values, title, xlabel, ylabel):
     """Scatter-plot a series of values on the current pyplot figure.

     The i-th value is drawn at x = i + 2 and an x tick is placed at every
     tenth position, starting from 2.

     Args:
         values: sequence of values to plot; converted with np.asarray.
         title: title of the plot.
         xlabel: label of the x axis.
         ylabel: label of the y axis.
     """
     points = np.asarray(values)
     count = len(points)
     # Hard-coded x offset; presumably the smallest cluster count
     # (the script's kmin is also 2) -- TODO confirm.
     offset = 2
     positions = np.arange(count) + offset
     tick_positions = np.arange(0, count, 10) + offset
     plt.scatter(positions, points, s=1)
     plt.xticks(tick_positions)
     plt.title(title)
     plt.xlabel(xlabel)
     plt.ylabel(ylabel)
 def save_plot(filepath):
     """Save the current pyplot figure to `filepath`, then close it."""
     plt.savefig(filepath)
     plt.close()
 def save_results(outfile, measures, titles):
     """Write several measure series to `outfile` as comma-separated columns.

     The first line is the header built from `titles`; each following line
     holds the i-th value of every series, converted with str().

     Args:
         outfile: path of the file to write (opened in "w" mode).
         measures: list of series indexed in parallel; the first one
             determines the number of rows. An empty list produces a file
             containing only the header.
         titles: column names, joined with commas for the header line.

     Raises:
         IndexError: if a series is shorter than the first one.
     """
     # The first series fixes the row count. Without any series there is
     # nothing to index, so only the header is written instead of failing
     # with IndexError after the file has already been created.
     n_rows = len(measures[0]) if len(measures) > 0 else 0
     with open(outfile, "w") as f:
         f.write(",".join(titles) + "\n")
         for i in range(n_rows):
             f.write(",".join(str(series[i]) for series in measures) + "\n")
 # -- PARSER
 parser = argparse.ArgumentParser(description="")
 parser.add_argument("expdir", type=str, help="Directory of experiment")
+parser.add_argument("--nkfold", type=int, default=4, help="number of kfold")
+parser.add_argument("--nkfoldmin", type=int, default=1, help="Begin with this numero of kfold")
 parser.add_argument("--measurefile", type=str, default="measures.json",
                     help="Measure file it searchs in folders")
 parser.add_argument("--suffix", type=str, default="",
                     help="suffix of saved files")
 args = parser.parse_args()
 EXP_DIR = args.expdir
 MEASURE_FILE = args.measurefile
 SUFFIX = args.suffix
+MAX_KFOLD = args.nkfold
+MIN_KFOLD = args.nkfoldmin
 # EXP_DIR="exp/kmeans_teacher_1/pvector-1"
 RESULTS_DIR = os.path.join(EXP_DIR, "res")
 # -- CONFIG
 kmin = 2
 kmax = 100
 # -- CREATE FOLDER
 if not os.path.exists(RESULTS_DIR):
     os.makedirs(RESULTS_DIR)
 # -- BEGIN REGROUPMENT
 subsets = ["train", "val"]
 disequilibriums = []
 def init_measures():
     """Build an empty measure store: one dict per subset, one list per measure.

     Reads the module-level `subsets` list. The measure keys are inserted in
     the order entropy, vscore, homogeneity, completeness; the plotting loop
     enumerates these keys to pick the subplot position, so the order is kept.

     Returns:
         dict mapping each subset name to a dict of empty lists.
     """
     measure_names = ("entropy", "vscore", "homogeneity", "completeness")
     return {
         subset: {name: [] for name in measure_names}
         for subset in subsets
     }
 measures = init_measures()
-for kfold in range(1, 5):
+for kfold in range(MIN_KFOLD, MAX_KFOLD + 1):
     print("Regrouping on kfold: " + str(kfold))
     # -- REGROUP MEASURES INTO LISTS
     for k in range(kmin, kmax+1):
         measures_file = os.path.join(EXP_DIR, str(kfold), str(k), MEASURE_FILE)
         with open(measures_file, 'r') as f:
             meas_data = json.load(f)
         disequilibriums.append(meas_data["disequilibrium"])
         for subset in subsets:
             measures[subset]["entropy"].append(
                 meas_data[subset]["entropy"])
             measures[subset]["vscore"].append(
                 meas_data[subset]["vscore"])
             measures[subset]["homogeneity"].append(
                 meas_data[subset]["homogeneity"])
             measures[subset]["completeness"].append(
                 meas_data[subset]["completeness"])
     # -- PLOT AND SAVE MEASURES FOR A SPECIFIC SUBSET
     for subset in subsets:
         # Plot all measures
         outf = "measures_" + str(subset) + "_" + str(kfold) + str(SUFFIX) + ".pdf"
         fig = plt.figure(1)
         for i, measure in enumerate(measures[subset]):
             plt.subplot(220 + i + 1)
             plot_values_clusters(
                 measures[subset][measure],
                 measure.capitalize() + " " + str(subset) + " set " + str(kfold),
                 "N clusters",
                 measure.capitalize())
         plt.subplots_adjust(hspace=0.5, wspace=0.3)
         save_plot(os.path.join(RESULTS_DIR, outf))
         # Save all measures on a csv file
         save_results(
             os.path.join(RESULTS_DIR, "measures_" + str(subset) + "_" + str(kfold) + str(SUFFIX) + ".csv"),
             [
                 measures[subset]["entropy"],
                 measures[subset]["homogeneity"],
                 measures[subset]["completeness"],
                 measures[subset]["vscore"]
             ],
             [
                 "entropy",
                 "homogeneity",
                 "completeness",
                 "vscore"
             ]
         )
     # PLOT AND SAVE FOR DISEQUILIBRIUM
     plot_values_clusters(
         disequilibriums,
         "Disequilibrium set " + str(kfold),
         "N clusters",
         "Disequilibrium")
     save_plot(os.path.join(RESULTS_DIR, "disequilibrium_" + str(kfold) + str(SUFFIX) + ".pdf"))
     save_results(
         os.path.join(RESULTS_DIR, "disequilibrium_" + str(kfold) + str(SUFFIX) + ".csv"),
         [disequilibriums],
         ["disequilibrium"])
     measures = init_measures()
     disequilibriums = []

bin/replace-features.py

Diff comments View file @ e63ab06

File was created	1
	2	import argparse
	3
	4	from data import read_file, index_by_id, write_line
	5
	6	# -- ARGPARSE
	7	parser = argparse.ArgumentParser(
	8	description="Replace features with file from to file to")
	9	parser.add_argument("fromfile", type=str, help="From list or features file")
	10	parser.add_argument("tofile", type=str, help="Features of 'from' saved into this file.")
	11
	12	args = parser.parse_args()
	13	FROM = args.fromfile
	14	TO = args.tofile
	15
	16
	17	# -- READ AND INDEX FILES
	18	from_data = read_file(FROM)
	19	from_by_id = index_by_id(from_data)
	20
	21	to_data = read_file(TO)
	22
	23	with open(TO, "w") as f:
	24	for line in to_data:
	25	metas = line[0]
	26	features = from_by_id[metas[0]][metas[3]][1]
	27	write_line(metas, features, f)
	28
	29

config/archives/ivector_config.sh

Diff comments View file @ e63ab06

File was created	1	OUTDIR="exp/kmeans_euclidian/ivectors"
	2	DATADIR="data"
	3	NEW_LSTDIR="${OUTDIR}/lst"
	4
	5	VECTOR_FILE="data/ivectors.txt" # To specify if there's only one
	6	VECTOR_FILES_ONE=true # Specify there's only one file
	7
	8	KMIN=2
	9	KMAX=100
	10

config/archives/pv_from_xv_config.sh

Diff comments View file @ e63ab06

File was created	1
	2	# Framework configuration
	3	OUTDIR="exp/kmeans_euclidian/pv_from_xv"
	4	DATADIR="data"
	5	NEW_LSTDIR="${OUTDIR}/lst"
	6
	7	VECTOR_FILES_BEGIN="${DATADIR}/pv_from_xv/me_pv_teacher"
	8	VECTOR_FILES_END=".txt"
	9	VECTOR_FILE="" # To specify if there's only one
	10	VECTOR_FILES_ONE=false # Specify there's only one file
	11
	12	KMIN=2
	13	KMAX=100
	14

config/archives/pvector_config.sh

Diff comments View file @ e63ab06

File was created	1
	2	OUTDIR="exp/kmeans_euclidian/teacher-pvector-1"
	3	DATADIR="data"
	4	NEW_LSTDIR="${OUTDIR}/lst"
	5
	6	VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher"
	7	VECTOR_FILES_END=".txt"
	8	VECTOR_FILE="" # To specify if there's only one
	9	VECTOR_FILES_ONE=false # Specify there's only one file
	10
	11	KMIN=2
	12	KMAX=100
	13

config/archives/pvector_layer1_config.sh

Diff comments View file @ e63ab06

File was created	1	OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer1"
	2	DATADIR="data"
	3	NEW_LSTDIR="${OUTDIR}/lst"
	4
	5	VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_1"
	6	VECTOR_FILES_END=".txt"
	7	VECTOR_FILE="" # To specify if there's only one
	8	VECTOR_FILES_ONE=false # Specify there's only one file
	9
	10	KMIN=2
	11	KMAX=100
	12

config/archives/pvector_layer2_config.sh

Diff comments View file @ e63ab06

File was created	1	OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer2"
	2	DATADIR="data"
	3	NEW_LSTDIR="${OUTDIR}/lst"
	4
	5	VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_2"
	6	VECTOR_FILES_END=".txt"
	7	VECTOR_FILE="" # To specify if there's only one
	8	VECTOR_FILES_ONE=false # Specify there's only one file
	9
	10	KMIN=2
	11	KMAX=100
	12

config/archives/pvector_layer3_config.sh

Diff comments View file @ e63ab06

File was created	1	OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer3"
	2	DATADIR="data"
	3	NEW_LSTDIR="${OUTDIR}/lst"
	4
	5	VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_3"
	6	VECTOR_FILES_END=".txt"
	7	VECTOR_FILE="" # To specify if there's only one
	8	VECTOR_FILES_ONE=false # Specify there's only one file
	9
	10	KMIN=2
	11	KMAX=100
	12

config/archives/pvector_layer4_config.sh

Diff comments View file @ e63ab06

File was created	1	OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer4"
	2	DATADIR="data"
	3	NEW_LSTDIR="${OUTDIR}/lst"
	4
	5	VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_4"
	6	VECTOR_FILES_END=".txt"
	7	VECTOR_FILE="" # To specify if there's only one
	8	VECTOR_FILES_ONE=false # Specify there's only one file
	9
	10	KMIN=2
	11	KMAX=100
	12

config/archives/xvector_config.sh

Diff comments View file @ e63ab06

File was created	1	OUTDIR="exp/kmeans_euclidian/xvectors"
	2	DATADIR="data"
	3	NEW_LSTDIR="${OUTDIR}/lst"
	4
	5	VECTOR_FILE="data/xvectors.txt" # To specify if there's only one
	6	VECTOR_FILES_ONE=true # Specify there's only one file
	7
	8	KMIN=2
	9	KMAX=100
	10

config/config_iv.sh

Diff comments View file @ e63ab06

File was created	1	OUTDIR="exp/kmeans_euclidian/iv"
	2	DATADIR="data"
	3	NEW_LSTDIR="${OUTDIR}/lst"
	4
	5	VECTOR_FILE="data/ivectors.txt" # To specify if there's only one
	6	VECTOR_FILES_ONE=true # Specify there's only one file
	7
	8	METAS_CHARACTER="data/masseffect.lst"
	9	CHAR_INFO="data/masseffect_character_information.csv"
	10
	11	ORIGINAL_VECTOR_FILE="${VECTOR_FILE}"
	12
	13	KMIN=2
	14	KMAX=100
	15
	16

config/config_iv_skyrim.sh

Diff comments View file @ e63ab06

File was created	1	OUTDIR="exp/kmeans_euclidian_skyrim/iv"
	2	DATADIR="data"
	3	NEW_LSTDIR="${OUTDIR}/lst"
	4
	5	VECTOR_FILE="../data/skyrim/skyrim_ivectors.txt" # To specify if there's only one
	6	VECTOR_FILES_ONE=true # Specify there's only one file
	7
	8	METAS_CHARACTER="../data/skyrim/skyrim.lst"
	9	CHAR_INFO="data/skyrim_character_information.csv"
	10
	11	ORIGINAL_VECTOR_FILE="${VECTOR_FILE}"
	12
	13	KMIN=2
	14	KMAX=100
	15
	16

config/config_pv_from_iv.sh

Diff comments View file @ e63ab06

File was created	1
	2	if [ -z "$kfold" ]
	3	then
	4	kfold=1
	5	fi
	6
	7	if [ -z "${t}" ]
	8	then
	9	t=2.0
	10	fi
	11
	12	OUTDIR="exp/kmeans_euclidian/pv_from_iv/${kfold}"
	13	DATADIR="data"
	14	MOTHER_LST_DIR="/local_disk/pegasus/laboinfo/mquillot/vocal_similarity_system/data/prot_alpha"
	15	NEW_LSTDIR="${OUTDIR}/lst"
	16
	17
	18	VECTOR_FILE="/local_disk/pegasus/laboinfo/mquillot/knowledge_distillation/exp/kd_iv/${kfold}/${t}/teacher/masseffect_pvectors.txt" # To specify if there's only one
	19	VECTOR_FILES_ONE=true # Specify there's only one file
	20	ORIGINAL_VECTOR_FILE="/local_disk/pegasus/laboinfo/mquillot/knowledge_distillation/data/masseffect.txt"
	21
	22
	23	MIN_KFOLD=${kfold}
	24	MAX_KFOLD=${kfold}
	25
	26	KMIN=2
	27	KMAX=100
	28

config/config_pv_from_xv.sh

Diff comments View file @ e63ab06

File was created	1
	2	if [ -z "$kfold" ]
	3	then
	4	kfold=1
	5	fi
	6
	7	if [ -z "${t}" ]
	8	then
	9	t=2.0
	10	fi
	11
	12	OUTDIR="exp/kmeans_euclidian/pv_from_xv/${kfold}"
	13	DATADIR="data"
	14	MOTHER_LST_DIR="/local_disk/pegasus/laboinfo/mquillot/vocal_similarity_system/data/prot_alpha"
	15	NEW_LSTDIR="${OUTDIR}/lst"
	16
	17
	18	VECTOR_FILE="/local_disk/pegasus/laboinfo/mquillot/knowledge_distillation/exp/kd_xvectors/${kfold}/${t}/teacher/masseffect_pvectors.txt" # To specify if there's only one
	19	VECTOR_FILES_ONE=true # Specify there's only one file
	20	ORIGINAL_VECTOR_FILE="/local_disk/pegasus/laboinfo/mquillot/knowledge_distillation/data/masseffect_xvectors.txt"
	21
	22	MIN_KFOLD=${kfold}
	23	MAX_KFOLD=${kfold}
	24
	25	KMIN=2
	26	KMAX=100
	27

config/config_without_kfold_iv.sh

Diff comments View file @ e63ab06

File was created	1	OUTDIR="exp/kmeans_euclidian_skyrim/ivectors"
	2	DATADIR="data"
	3	NEW_LSTDIR="${OUTDIR}/lst"
	4
	5	LST_FILE="/local_disk/pegasus/laboinfo/mquillot/data/skyrim/skyrim_ivectors.txt"
	6	VECTOR_FILE="data/ivectors.txt" # To specify if there's only one
	7	VECTOR_FILES_ONE=true # Specify there's only one file
	8
	9	WITHOUT_KFOLD=""
	10	KMIN=2
	11	KMAX=100
	12
	13	METAS_CHARACTER=""

config/config_xv.sh

Diff comments View file @ e63ab06

File was created	1	OUTDIR="exp/kmeans_euclidian/xv"
	2	DATADIR="data"
	3	NEW_LSTDIR="${OUTDIR}/lst"
	4
	5	VECTOR_FILE="data/xvectors.txt" # To specify if there's only one
	6	VECTOR_FILES_ONE=true # Specify there's only one file
	7
	8	ORIGINAL_VECTOR_FILE="${VECTOR_FILE}"
	9	KMIN=2
	10	KMAX=100
	11

config/ivector_config.sh

View file @ e63ab06

1	OUTDIR="exp/kmeans_euclidian/ivectors"		File was deleted
2	DATADIR="data"
3	NEW_LSTDIR="${OUTDIR}/lst"
4
5	VECTOR_FILE="data/ivectors.txt" # To specify if there's only one
6	VECTOR_FILES_ONE=true # Specify there's only one file
7
8	KMIN=2
9	KMAX=100
10		1	OUTDIR="exp/kmeans_euclidian/ivectors"

config/pv_from_xv_config.sh

View file @ e63ab06

1			File was deleted
2	# Framework configuration
3	OUTDIR="exp/kmeans_euclidian/pv_from_xv"
4	DATADIR="data"
5	NEW_LSTDIR="${OUTDIR}/lst"
6
7	VECTOR_FILES_BEGIN="${DATADIR}/pv_from_xv/me_pv_teacher"
8	VECTOR_FILES_END=".txt"
9	VECTOR_FILE="" # To specify if there's only one
10	VECTOR_FILES_ONE=false # Specify there's only one file
11
12	KMIN=2
13	KMAX=100
14		1

config/pvector_config.sh

View file @ e63ab06

1			File was deleted
2	OUTDIR="exp/kmeans_euclidian/teacher-pvector-1"
3	DATADIR="data"
4	NEW_LSTDIR="${OUTDIR}/lst"
5
6	VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher"
7	VECTOR_FILES_END=".txt"
8	VECTOR_FILE="" # To specify if there's only one
9	VECTOR_FILES_ONE=false # Specify there's only one file
10
11	KMIN=2
12	KMAX=100
13		1

config/pvector_layer1_config.sh

View file @ e63ab06

1	OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer1"		File was deleted
2	DATADIR="data"
3	NEW_LSTDIR="${OUTDIR}/lst"
4
5	VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_1"
6	VECTOR_FILES_END=".txt"
7	VECTOR_FILE="" # To specify if there's only one
8	VECTOR_FILES_ONE=false # Specify there's only one file
9
10	KMIN=2
11	KMAX=100
12		1	OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer1"

config/pvector_layer2_config.sh

View file @ e63ab06

1	OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer2"		File was deleted
2	DATADIR="data"
3	NEW_LSTDIR="${OUTDIR}/lst"
4
5	VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_2"
6	VECTOR_FILES_END=".txt"
7	VECTOR_FILE="" # To specify if there's only one
8	VECTOR_FILES_ONE=false # Specify there's only one file
9
10	KMIN=2
11	KMAX=100
12		1	OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer2"

config/pvector_layer3_config.sh

View file @ e63ab06

1	OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer3"		File was deleted
2	DATADIR="data"
3	NEW_LSTDIR="${OUTDIR}/lst"
4
5	VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_3"
6	VECTOR_FILES_END=".txt"
7	VECTOR_FILE="" # To specify if there's only one
8	VECTOR_FILES_ONE=false # Specify there's only one file
9
10	KMIN=2
11	KMAX=100
12		1	OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer3"

config/pvector_layer4_config.sh

View file @ e63ab06

1	OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer4"		File was deleted
2	DATADIR="data"
3	NEW_LSTDIR="${OUTDIR}/lst"
4
5	VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_4"
6	VECTOR_FILES_END=".txt"
7	VECTOR_FILE="" # To specify if there's only one
8	VECTOR_FILES_ONE=false # Specify there's only one file
9
10	KMIN=2
11	KMAX=100
12		1	OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer4"

config/xvector_config.sh

View file @ e63ab06

1	OUTDIR="exp/kmeans_euclidian/xvectors"		File was deleted
2	DATADIR="data"
3	NEW_LSTDIR="${OUTDIR}/lst"
4
5	VECTOR_FILE="data/xvectors.txt" # To specify if there's only one
6	VECTOR_FILES_ONE=true # Specify there's only one file
7
8	KMIN=2
9	KMAX=100
10		1	OUTDIR="exp/kmeans_euclidian/xvectors"

extract-labels-pv-from-xv.sh

View file @ e63ab06

1			File was deleted
2
3	# Number of set
4	k=4
5
6
7	# Vector features file
8	DATADIR="data"
9
10	VECTOR_FILE_MASSEFFECT="${DATADIR}/xvectors.txt"
11
12	for kmean in 12 41 45 50 6 69 72 88
13	do
14	echo "KMEAN: ${kmean}"
15	# Dirs
16	EXP_DIR="exp/kmeans_euclidian/pv_from_xv/${k}/${kmean}"
17	CLUSTERING="${EXP_DIR}/clustering_${kmean}.pkl"
18
19
20	# Output dirs
21	OUTFILE_MASSEFFECT="data/pv_from_xv/saved_clustered/masseffect_clustered_${k}_${kmean}.txt"
22	echo "Extracting"
23	python3 bin/extract_kmeans.py "${CLUSTERING}" \
24	"${VECTOR_FILE_MASSEFFECT}" \
25	--outfile "$OUTFILE_MASSEFFECT"
26	echo "End extracting"
27	done
28		1

extract-labels.sh

View file @ e63ab06

1			File was deleted
2
3	# Number of set
4	k=4
5	kmean=88
6
7
8	# Vector features file
9	VECTOR_FILE_MASSEFFECT="data/xvectors.txt"
10
11
12	# Dirs
13	EXP_DIR="exp/kmeans_euclidian/xvectors/${k}/${kmean}"
14	CLUSTERING="${EXP_DIR}/clustering_${kmean}.pkl"
15
16
17	# Output dirs
18	OUTFILE_MASSEFFECT="data/xvectors/saved_clustered/masseffect_clustered_xvectors_${k}_${kmean}.txt"
19
20	python3 bin/extract_kmeans.py "${CLUSTERING}" \
21	"${VECTOR_FILE_MASSEFFECT}" \
22	--outfile "$OUTFILE_MASSEFFECT"
23		1

rm-unused-files.sh

View file @ e63ab06

1			File was deleted
2	if [ $# -eq 1 ]
3	then
4	EXP_DIR="$1"
5	else
6	echo "Need to have one and only one argument. This argument is the exp directory."
7	exit 1
8	fi
9
10	for kfold in {1..4}
11	do
12	for k in {1..100}
13	do
14	rm ${EXP_DIR}/$kfold/$k/clustered_$k.txt
15	done
16	done
17		1

run-clustering.sh

Diff comments View file @ e63ab06

1	#	1	#
2	# This script aims to compute clustering	2	# This script aims to compute clustering
3	#	3	#
4		4
5		5
6	# -- CONFIGURATION	6	# -- CONFIGURATION
7	# THIS SCRIPT NEEDS THESE VARIABLES	7	# THIS SCRIPT NEEDS THESE VARIABLES
8	# Vector file	8	# Vector file
9	#VECTOR_FILE=""	9	#VECTOR_FILE=""
10	# Train list	10	# Train list
11	#TRAIN_LST==""	11	#TRAIN_LST==""
12	# Val list	12	# Val list
13	#VAL_LST=""	13	#VAL_LST=""
14	# Exp directory	14	# Exp directory
15	#EXP_DIR=""	15	#EXP_DIR=""
16	# Metas file with type values	16	# Metas file with type values
17	#METAS_TYPE=""	17	#METAS_TYPE=""
18	# Metas file with character values	18	# Metas file with character values
19	#METAS_CHARACTER=""	19	#METAS_CHARACTER=""
20		20
21		21
22	#echo "VECTOR FILE: $VECTOR_FILE"	22	#echo "VECTOR FILE: $VECTOR_FILE"
23	#echo "TRAIN LIST: $TRAIN_LST"	23	#echo "TRAIN LIST: $TRAIN_LST"
24	#echo "VAL LIST: $VAL_LST"	24	#echo "VAL LIST: $VAL_LST"
25	#echo "EXP DIR: $EXP_DIR"	25	#echo "EXP DIR: $EXP_DIR"
26	#echo "METAS TYPE: $METAS_TYPE"	26	#echo "METAS TYPE: $METAS_TYPE"
27	#echo "METAS_CHARACTER: $METAS_CHARACTER"	27	#echo "METAS_CHARACTER: $METAS_CHARACTER"
28		28
29		29
30		30
31	# -- TRAIN KMEANS	31	# -- TRAIN KMEANS
32	echo "Clustering - ${kfold}"	32	echo "Clustering - ${kfold}"
33	python3 bin/cluster_kmeans.py "${VECTOR_FILE}" \	33	python3 bin/cluster_kmeans.py "${VECTOR_FILE}" \
34	"${TRAIN_LST}" \	34	"${TRAIN_LST}" \
35	"${EXP_DIR}" --kmin ${KMIN} --kmax ${KMAX}	35	"${EXP_DIR}" --kmin ${KMIN} --kmax ${KMAX}
36		36
37		37
38		38
39	for k in $(seq ${KMIN} 1 ${KMAX})	39	for k in $(seq ${KMIN} 1 ${KMAX})
40	do	40	do
41	SUB_EXP_DIR="${EXP_DIR}/${k}"	41	SUB_EXP_DIR="${EXP_DIR}/${k}"
42		42
43	# -- EXTRACT KMEANS VALUES	43	# -- EXTRACT KMEANS VALUES
44	echo "Kmeans Measuring and extraction - ${k}"	44	echo "Kmeans Measuring and extraction - ${k}"
45	python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \	45	python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \
46	"${VECTOR_FILE}" \	46	"${VECTOR_FILE}" \
47	--outfile "${SUB_EXP_DIR}/clustered_${k}.txt"	47	--outfile "${SUB_EXP_DIR}/clustered_${k}.txt"
48	# -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR	48	# -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR
49	# Measures	49	# Measures
50	python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \	50	python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
51	"${METAS_CHARACTER}" \	51	"${METAS_CHARACTER}" \
52	"${TRAIN_LST}" \	52	"${TRAIN_LST}" \
53	"${VAL_LST}" \	53	"${VAL_LST}" \
54	--outfile "${SUB_EXP_DIR}/measures.json"	54	--outfile "${SUB_EXP_DIR}/measures.json"
55		55
56	# Plot count matrix for train	56	# Plot count matrix for train
57	python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \	57	python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
58	${VECTOR_FILE} \	58	${VECTOR_FILE} \
59	${TRAIN_LST} \	59	${TRAIN_LST} \
60	--outfile "${SUB_EXP_DIR}/train_count_matrix.pdf"	60	--outfile "${SUB_EXP_DIR}/train_count_matrix.pdf"
61		61
62	# Plot count matrix for val	62	# Plot count matrix for val
63	python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \	63	python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
64	${VECTOR_FILE} \	64	${VECTOR_FILE} \
65	${VAL_LST} \	65	${VAL_LST} \
66	--outfile "${SUB_EXP_DIR}/val_count_matrix.pdf"	66	--outfile "${SUB_EXP_DIR}/val_count_matrix.pdf"
67		67
68	# -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR	68	# -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR
69	# Measures	69	# Measures
70	python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \	70	python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
71	"${METAS_TYPE}" \	71	"${METAS_TYPE}" \
72	"${TRAIN_LST}" \	72	"${TRAIN_LST}" \
73	"${VAL_LST}" \	73	"${VAL_LST}" \
74	--outfile "${SUB_EXP_DIR}/measures_type.json"	74	--outfile "${SUB_EXP_DIR}/measures_type.json"
75		75
76	# This script plot the count matrix of the train set	76	# This script plot the count matrix of the train set
77	python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \	77	python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
78	"${METAS_TYPE}" \	78	"${METAS_TYPE}" \
79	"${TRAIN_LST}" \	79	"${TRAIN_LST}" \
80	--outfile "${SUB_EXP_DIR}/train_count_matrix_type.pdf"	80	--outfile "${SUB_EXP_DIR}/train_count_matrix_type.pdf"
81		81
82	# This script plot the count matrix of the validation set	82	# This script plot the count matrix of the validation set
83	python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \	83	python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
84	"${METAS_TYPE}" \	84	"${METAS_TYPE}" \
85	"${VAL_LST}" \	85	"${VAL_LST}" \
86	--outfile "${SUB_EXP_DIR}/val_count_matrix_type.pdf"	86	--outfile "${SUB_EXP_DIR}/val_count_matrix_type.pdf"
87		87
88		88
89	# -- MEASURES AND PLOT WITH RESPECT TO LANG VAR	89	# -- MEASURES AND PLOT WITH RESPECT TO LANG VAR
90	# Measures	90	# Measures
91	python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \	91	python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
92	"${METAS_LANG}" \	92	"${METAS_LANG}" \
93	"${TRAIN_LST}" \	93	"${TRAIN_LST}" \
94	"${VAL_LST}" \	94	"${VAL_LST}" \
95	--outfile "${SUB_EXP_DIR}/measures_lang.json"	95	--outfile "${SUB_EXP_DIR}/measures_lang.json"
96		96
97	# This script plot the count matrix of the train set	97	# This script plot the count matrix of the train set
98	python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \	98	python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
99	"${METAS_LANG}" \	99	"${METAS_LANG}" \
100	"${TRAIN_LST}" \	100	"${TRAIN_LST}" \
101	--outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf"	101	--outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf"
102		102
103	# This script plot the count matrix of the validation set	103	# This script plot the count matrix of the validation set
104	python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \	104	python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
105	"${METAS_LANG}" \	105	"${METAS_LANG}" \
106	"${VAL_LST}" \	106	"${VAL_LST}" \
107	--outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf"	107	--outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf"
108		108
109	done	109	done
110		110
111		111

run-measures.sh

Diff comments View file @ e63ab06

 # For now, this run script only executes
 # a few small commands that we want
 # to test.
-OUTDIR="exp/kmeans_euclidian/teacher-pvector-1"
+set -e
+OUTDIR="exp/kmeans_euclidian/ivectors"
 EXP_DIR=${OUTDIR}
 DATADIR="data"
 NEW_LSTDIR="${OUTDIR}/lst"
 kmin=2
 kmax=100
 if [ ! -d "$OUTDIR" ];
 then
     mkdir -p $OUTDIR
 fi
 if [ ! -d "$NEW_LSTDIR" ];
 then
     mkdir -p $NEW_LSTDIR
 fi
 for kfold in {1..4}
 do
-    pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt"
+    #pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt"
-    VECTOR_FILE=$pvector_file
+    VECTOR_FILE="${DATADIR}/ivectors.txt"
     lst_dir="${DATADIR}/pvectors_1rst/lst"
     output_kfold="${OUTDIR}/${kfold}"
     #python3 "bin/replace_label.py" \
     #    "${DATADIR}/masseffect.lst" \
     #    "${DATADIR}/character_information.csv" \
     #    --field "type" --lst "data/pvectors_1rst/lst/train_${kfold}.lst" \
     #    --outfile "${NEW_LSTDIR}/train_${kfold}_type.lst"
     #python3 "bin/replace_label.py" \
     #    "${DATADIR}/masseffect.lst" \
     #    "${DATADIR}/character_information.csv" \
     #    --field "type" --lst "data/pvectors_1rst/lst/val_${kfold}.lst" \
     #    --outfile "${NEW_LSTDIR}/val_${kfold}_type.lst"
     #cat "${NEW_LSTDIR}/train_${kfold}_type.lst" "${NEW_LSTDIR}/val_${kfold}_type.lst" > "${NEW_LSTDIR}/metas_${kfold}_type.lst"
     TRAIN_LST=${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst
     VAL_LST=${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst
     TRAIN_LANG_LST=${NEW_LSTDIR}/train_${kfold}_lang.lst
     VAL_LANG_LST=${NEW_LSTDIR}/val_${kfold}_lang.lst
     METAS_LANG=${NEW_LSTDIR}/metas_${kfold}_lang.lst
     # EXTRACT LANGUAGE INFORMATION
     # awk copies field 1 into field 2 of every comma-separated line. The
     # assignment doubles as the pattern, so a line whose first field is empty
     # or numerically zero is not printed.
     awk '$2=$1' FS=, OFS=, "${TRAIN_LST}" > "${TRAIN_LANG_LST}"
     echo "TRAIN EXTRACT LANGUAGE INFO DONE"
     awk '$2=$1' FS=, OFS=, "${VAL_LST}" > "${VAL_LANG_LST}"
     echo "VAL EXTRACT LANGUAGE INFO DONE"
     # Concatenate the train and val lists into the language metas file.
     cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}"
     echo "GLOBAL EXTRACT LANGUAGE INFO DONE"
     echo "Clustering - ${kfold}"
     for k in $(seq ${kmin} 1 ${kmax})
     do
         echo "Kmeans Measuring and ploting - ${k}"
-	SUB_EXP_DIR="${EXP_DIR}/${kfold}/${k}"
+        SUB_EXP_DIR="${EXP_DIR}/${kfold}/${k}"
-	# -- EXTRACT CLUSTERING LABELS
+        # -- EXTRACT CLUSTERING LABELS
-	python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \
+        python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \
-        "${VECTOR_FILE}" \
+            "${VECTOR_FILE}" \
-        --outfile "${SUB_EXP_DIR}/clustered_${k}.txt"
+            --outfile "${SUB_EXP_DIR}/clustered_${k}.txt"
-	# -- MEASURES AND PLOT WITH RESPECT TO LANG VAR
+        # -- MEASURES AND PLOT
-        # Measures
+        source steps/measure_clustering_char.sh
-        python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
+        source steps/measure_clustering_type.sh
-            "${METAS_LANG}" \
+        source steps/measure_clustering_lang.sh
-            "${TRAIN_LST}" \
-            "${VAL_LST}" \
-            --outfile "${SUB_EXP_DIR}/measures_lang.json"
-        # This script plot the count matrix of the train set
+        rm ${SUB_EXP_DIR}/clustered_${k}.txt
-        python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
-            "${METAS_LANG}" \
-            "${TRAIN_LST}" \
-            --outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf"
-        # This script plot the count matrix of the validation set
-        python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
-            "${METAS_LANG}" \
-            "${VAL_LST}" \
-             --outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf"
-	rm ${SUB_EXP_DIR}/clustered_${k}.txt
-        #python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" \
-        #    "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" \
-        #    "${lst_dir}/val_${kfold}.lst" \
-        #    --outfile "${output_kfold}/${k}/measures_type.json"
-        # This script plot the count matrix of the train set
-        #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
-        #    ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \
-        #    --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf
-        # This script plot the count matrix of the validation set
-        #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
-        #    ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \
-        #    --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf
-        # This script plot the count matrix of the train set
-        #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
-        #    ${pvector_file} ${lst_dir}/train_${kfold}.lst \
-        #    --outfile ${output_kfold}/${k}/train_count_matrix.pdf
-        # This script plot the count matrix of the validation set
-        #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
-        #    ${pvector_file} ${lst_dir}/val_${kfold}.lst \
-        #    --outfile ${output_kfold}/${k}/val_count_matrix.pdf
     done
 done

run-skyrim.sh

Diff comments View file @ e63ab06

	File was created	1	python bin/cluster_kmeans.py ../data/skyrim/skyrim_ivectors.txt ../data/skyrim/skyrim.lst exp/kmeans_euclidian_skyrim/ivectors/ --kmin 1 --kmax 100
		2

run.sh

Diff comments View file @ e63ab06

 #OUTDIR="exp/test/pvector-2"
 #DATADIR="data"
 #NEW_LSTDIR="${OUTDIR}/lst"
 #VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher"
 #VECTOR_FILES_END=".txt"
 #VECTOR_FILE="" # To specify if there's only one
 #VECTOR_FILES_ONE=false # Specify there's only one file
 #KMIN=2
 #KMAX=100
 # -- LOAD CONFIG FILE
 # The configuration file is the single mandatory argument. Any other
 # argument count is a usage error: report it on stderr and stop with a
 # valid non-zero status (exit -1 is outside the 0-255 range).
 if [ $# -ne 1 ]
 then
     echo "Need to have one and only one argument" >&2
     exit 1
 fi
 CONFIG_FILE="$1"
 # Quoted so that a path containing spaces is sourced as one file.
 source "$CONFIG_FILE"
 # -- DEFAULTS VALUES CONFIGURATION
 # Keep the value from the config file; fall back to "false" when it is
 # unset or empty.
 VECTOR_FILES_ONE="${VECTOR_FILES_ONE:-false}"
+if [ -z "$METAS_CHARACTER" ]
+then
+    METAS_CHARACTER="${DATADIR}/masseffect.lst"
+fi
+if [ -z "$CHAR_INFO" ]
+then
+    CHAR_INFO="${DATADIR}/character_information.csv"
+fi
 # -- MAKE DIRECTORIES
 # mkdir -p succeeds when the directory already exists, so no prior -d
 # test is needed. The paths are quoted so that a directory name containing
 # spaces or glob characters is created as a single directory.
 mkdir -p "${OUTDIR}"
 mkdir -p "${NEW_LSTDIR}"
 # -- KFOLD MIN and MAX
 # Keep the bounds from the config file; fall back to folds 1 to 4 when
 # they are unset or empty.
 MIN_KFOLD="${MIN_KFOLD:-1}"
 MAX_KFOLD="${MAX_KFOLD:-4}"
 # -- BEGIN BY KFOLD
 # For every fold from MIN_KFOLD to MAX_KFOLD: build the per-fold list paths,
 # derive the "type" and language label lists from the train/val lists, then
 # source run-clustering.sh for that fold.
 for kfold in $(seq ${MIN_KFOLD} ${MAX_KFOLD})
 do
     # Some useful variables: per-fold list files written under NEW_LSTDIR
-    CHAR_INFO="${DATADIR}/character_information.csv"
     TRAIN_TYPE_LST="${NEW_LSTDIR}/train_${kfold}_type.lst"
     VAL_TYPE_LST="${NEW_LSTDIR}/val_${kfold}_type.lst"
     TRAIN_LANG_LST="${NEW_LSTDIR}/train_${kfold}_lang.lst"
     VAL_LANG_LST="${NEW_LSTDIR}/val_${kfold}_lang.lst"
     # Configuration for the run clustering file.
     # When VECTOR_FILES_ONE is "false", the vector file name is rebuilt for
     # each fold from VECTOR_FILES_BEGIN and VECTOR_FILES_END; otherwise
     # VECTOR_FILE keeps whatever value it already has (presumably set in the
     # configuration file - confirm).
     if [ ${VECTOR_FILES_ONE} == false ]
     then
         VECTOR_FILE="${VECTOR_FILES_BEGIN}_${kfold}${VECTOR_FILES_END}"
     fi
-    TRAIN_LST="${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst"
+    TRAIN_LST="${MOTHER_LST_DIR}/lst/train_${kfold}.lst"
-    VAL_LST="${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst"
+    VAL_LST="${MOTHER_LST_DIR}/lst/val_${kfold}.lst"
     # One experiment directory per fold
     EXP_DIR="${OUTDIR}/${kfold}"
     METAS_TYPE="${NEW_LSTDIR}/metas_${kfold}_type.lst"
-    METAS_CHARACTER="${DATADIR}/masseffect.lst"
     METAS_LANG="${NEW_LSTDIR}/metas_${kfold}_lang.lst"
     if [ ! -d "${EXP_DIR}" ];
     then
         mkdir -p ${EXP_DIR}
     fi
     # EXTRACT TYPE INFORMATION
     # bin/replace_label.py is run once on the train list and once on the
     # validation list with --field "type"; the two outputs are then
     # concatenated into METAS_TYPE.
     echo "Extracting character information"
     echo "Replace in train"
     python3 "bin/replace_label.py" \
         "${METAS_CHARACTER}" \
         "${CHAR_INFO}" \
         --field "type" \
         --lst "${TRAIN_LST}" \
         --outfile "${TRAIN_TYPE_LST}"
     echo "Replace in val"
     python3 "bin/replace_label.py" \
         "${METAS_CHARACTER}" \
         "${CHAR_INFO}" \
         --field "type" \
         --lst "${VAL_LST}" \
         --outfile "${VAL_TYPE_LST}"
     echo "Merge them"
     cat "${TRAIN_TYPE_LST}" "${VAL_TYPE_LST}" > "${METAS_TYPE}"
     # EXTRACT LANGUAGE INFORMATION
     # awk copies the first comma-separated field into the second one. The
     # assignment is used as the awk pattern, so a line is printed only when
     # the assigned value is non-empty and not numerically zero.
     echo "Language info for train"
     awk '$2=$1' FS=, OFS=, ${TRAIN_LST} > ${TRAIN_LANG_LST}
     echo "Language info for val"
     awk '$2=$1' FS=, OFS=, ${VAL_LST} > ${VAL_LANG_LST}
     echo "Merge them"
     cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}"
     echo "Then Run Clustering"
     # Sourced rather than executed: it runs in this shell and can read the
     # variables assigned above (which ones it uses is not visible here).
     source "run-clustering.sh"
 done
 # Regroup measures with respect to character classes
 # OUTDIR is quoted so that a path containing spaces stays one argument.
 # NOTE(review): bin/regroup-measures.py accepts --nkfold / --nkfoldmin
 # (defaults 4 and 1). They are not forwarded here, so a configuration that
 # overrides MIN_KFOLD / MAX_KFOLD may regroup a different fold range - confirm.
 echo "Regrouping measures with respect to character classes"
 python3 "bin/regroup-measures.py" "${OUTDIR}"
 # Regroup measures with respect to type classes
 echo "Regrouping measures with respect to type classes"
 python3 "bin/regroup-measures.py" "${OUTDIR}" --suffix "_type" --measurefile "measures_type.json"

run_kfold.sh

Diff comments View file @ e63ab06

File was created	1
	# Source run.sh once per fold (1 to 4). Because run.sh is sourced, it runs
	# in this shell: it sees ${kfold} and, since no arguments are given to
	# "source", the positional arguments of this script.
	# Brace expansion replaces the backtick `seq` call (no external process).
	for kfold in {1..4}
	do
	    echo "KFOLD: ${kfold}"
	    source run.sh
	done
	7
	8

run_without_kfold.sh

Diff comments View file @ e63ab06

File was created	1
	2	for k in $(seq ${KMIN} 1 ${KMAX})
	3	do
	4	SUB_EXP_DIR="${EXP_DIR}/${k}"
	5
	6	# -- EXTRACT KMEANS VALUES
	7	echo "Kmeans Measuring and extraction - ${k}"
	8	python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \
	9	"${VECTOR_FILE}" \
	10	--outfile "${SUB_EXP_DIR}/clustered_${k}.txt"
	11
	12	python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
	13	"${METAS_CHARACTER}" \
	14	"${TRAIN_LST}" \
	15	"${VAL_LST}" \
	16	--outfile "${SUB_EXP_DIR}/measures.json"

steps/extract_cluster_file.sh

Diff comments View file @ e63ab06

File was created	1
	# Usage: extract_cluster_file.sh CONFIG_FILE
	# For every fold (1-4) and every k from 2 to 100: run bin/extract_kmeans.py
	# with the saved clustering model, write the first column of its output to
	# a .lst file, then run bin/replace-features.py on the output file.
	for kfold in $(seq 1 4)
	do
	    # The configuration is sourced again on every fold, after kfold is set
	    # (presumably so that VECTOR_FILE can depend on ${kfold} - confirm).
	    source "$1"
	    vector_file=${VECTOR_FILE}
	    echo "kfold: $kfold"
	    for kmean in $(seq 2 100)
	    do
	        echo "kmean: $kmean"
	        exp_dir="${OUTDIR}/${kfold}/${kmean}"
	        clustering="${exp_dir}/clustering_${kmean}.pkl"
	        save_loc="${exp_dir}"
	        saved_txt="${save_loc}/masseffect_clustered.txt"
	        saved_lst="${save_loc}/masseffect_clustered.lst"

	        python3 bin/extract_kmeans.py "${clustering}" \
	            "${vector_file}" \
	            --outfile "${saved_txt}"

	        # Keep only the first space-separated column. The former "\|" was an
	        # escaped pipe: cat received "|", "cut", ... as file names and no
	        # column was ever extracted.
	        cut -d" " -f1 "${saved_txt}" > "${saved_lst}"

	        python3 bin/replace-features.py "${ORIGINAL_VECTOR_FILE}" "${saved_txt}"
	    done
	done
	25

steps/extract_cluster_file_skyrim.sh

Diff comments View file @ e63ab06

File was created	1
	# Usage: extract_cluster_file_skyrim.sh CONFIG_FILE
	# Same processing as the k-fold variant but without a fold level: the
	# experiment directories are ${OUTDIR}/<k> for k from 2 to 100.
	source "$1"
	vector_file=${VECTOR_FILE}
	# NOTE(review): kfold is not assigned in this script; this prints an empty
	# value unless the sourced configuration defines it.
	echo "kfold: $kfold"
	for kmean in $(seq 2 100)
	do
	    echo "kmean: $kmean"
	    exp_dir="${OUTDIR}/${kmean}"
	    clustering="${exp_dir}/clustering_${kmean}.pkl"
	    save_loc="${exp_dir}"
	    saved_txt="${save_loc}/masseffect_clustered.txt"
	    saved_lst="${save_loc}/masseffect_clustered.lst"

	    python3 bin/extract_kmeans.py "${clustering}" \
	        "${vector_file}" \
	        --outfile "${saved_txt}"

	    # Keep only the first space-separated column. The former "\|" was an
	    # escaped pipe: cat received "|", "cut", ... as file names and no
	    # column was ever extracted.
	    cut -d" " -f1 "${saved_txt}" > "${saved_lst}"

	    python3 bin/replace-features.py "${ORIGINAL_VECTOR_FILE}" "${saved_txt}"
	done
	22
	23

steps/extract_language_lst.sh

Diff comments View file @ e63ab06

File was created	1	DATADIR="data"
	2	OUTDIR="exp/kmeans_euclidian/ivectors"
	3	NEW_LSTDIR="${OUTDIR}/lst"
	4
	# Input lists and output language lists for the current fold.
	# NOTE(review): kfold is not assigned in this script; it has to be set by
	# the caller or the environment - confirm.
	TRAIN_LST="${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst"
	VAL_LST="${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst"
	TRAIN_LANG_LST="${NEW_LSTDIR}/train_${kfold}_lang.lst"
	VAL_LANG_LST="${NEW_LSTDIR}/val_${kfold}_lang.lst"
	METAS_LANG="${NEW_LSTDIR}/metas_${kfold}_lang.lst"

	# The redirections below fail if the output directory does not exist.
	mkdir -p -- "${NEW_LSTDIR}"

	# awk copies the first comma-separated field into the second one; the
	# assignment is the pattern, so lines whose first field is empty or
	# numerically zero are not printed.
	awk '$2=$1' FS=, OFS=, "${TRAIN_LST}" > "${TRAIN_LANG_LST}"
	# The TRAIN and VAL messages used to be swapped.
	echo "TRAIN EXTRACT LANGUAGE INFO DONE"
	awk '$2=$1' FS=, OFS=, "${VAL_LST}" > "${VAL_LANG_LST}"
	echo "VAL EXTRACT LANGUAGE INFO DONE"
	cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}"
	echo "GLOBAL EXTRACT LANGUAGE INFO DONE"

steps/measure_clustering_char.sh

Diff comments View file @ e63ab06

File was created	1
	2	python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" \
	3	"${lst_dir}/trainval_${kfold}.lst" "${lst_dir}/train_${kfold}.lst" \
	4	"${lst_dir}/val_${kfold}.lst" \
	5	--outfile "${output_kfold}/${k}/measures.json"
	6	# NOTE(review): the command above uses ${output_kfold}/${k} while the plots
	7	# below use ${SUB_EXP_DIR}; confirm both name the same directory.
	8	# Plot the count matrix of the train set (the train list is passed twice).
	9	python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
	10	"${lst_dir}/train_${kfold}.lst" \
	11	"${lst_dir}/train_${kfold}.lst" \
	12	--outfile "${SUB_EXP_DIR}/train_count_matrix.pdf"
	13
	14	# Plot the count matrix of the validation set (the val list is passed twice).
	15	python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
	16	"${lst_dir}/val_${kfold}.lst" \
	17	"${lst_dir}/val_${kfold}.lst" \
	18	--outfile "${SUB_EXP_DIR}/val_count_matrix.pdf"
	19

steps/measure_clustering_lang.sh

Diff comments View file @ e63ab06

File was created	1
	2	python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
	3	"${METAS_LANG}" \
	4	"${TRAIN_LST}" \
	5	"${VAL_LST}" \
	6	--outfile "${SUB_EXP_DIR}/measures_lang.json"
	7	# All variables used in this file (SUB_EXP_DIR, k, METAS_LANG, TRAIN_LST, VAL_LST) must be set by the caller.
	8	# Plot the count matrix of the train set, using METAS_LANG as labels file.
	9	python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
	10	"${METAS_LANG}" \
	11	"${TRAIN_LST}" \
	12	--outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf"
	13
	14	# Plot the count matrix of the validation set, using METAS_LANG as labels file.
	15	python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
	16	"${METAS_LANG}" \
	17	"${VAL_LST}" \
	18	--outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf"

steps/measure_clustering_type.sh

Diff comments View file @ e63ab06

File was created	1	python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" \
	2	"${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" \
	3	"${lst_dir}/val_${kfold}.lst" \
	4	--outfile "${output_kfold}/${k}/measures_type.json"
	5	# All variables used in this file (output_kfold, k, kfold, NEW_LSTDIR, lst_dir) must be set by the caller.
	6	# Plot the count matrix of the train set, using the per-fold "type" metas list.
	7	python3 bin/plot-count-matrix.py "${output_kfold}/${k}/clustered_${k}.txt" \
	8	"${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" \
	9	--outfile "${output_kfold}/${k}/train_count_matrix_type.pdf"
	10
	11	# Plot the count matrix of the validation set, using the per-fold "type" metas list.
	12	python3 bin/plot-count-matrix.py "${output_kfold}/${k}/clustered_${k}.txt" \
	13	"${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/val_${kfold}.lst" \
	14	--outfile "${output_kfold}/${k}/val_count_matrix_type.pdf"
	15

steps/save_clusters_file.sh

Diff comments View file @ e63ab06

File was created	1
	# For every fold (1-4) and every k from 2 to 100: run bin/extract_kmeans.py
	# with the saved clustering model on data/xvectors.txt, store the result in
	# the shared output directory and write its first column to a .lst file.
	vector_file="data/xvectors.txt"
	# No trailing slash, so the paths built below do not contain "//".
	save_loc="data/xvectors/saved_clustered"
	# The output directory is not created anywhere else in this script.
	mkdir -p -- "${save_loc}"

	for kfold in $(seq 1 4)
	do
	    echo "kfold: $kfold"
	    for kmean in $(seq 2 100)
	    do
	        echo "kmean: $kmean"
	        exp_dir="exp/kmeans_euclidian/xvectors/${kfold}/${kmean}"
	        clustering="${exp_dir}/clustering_${kmean}.pkl"
	        saved_txt="${save_loc}/masseffect_clustered_xvectors_${kfold}_${kmean}.txt"
	        saved_lst="${save_loc}/masseffect_clustered_xvectors_${kfold}_${kmean}.lst"

	        python3 bin/extract_kmeans.py "${clustering}" \
	            "${vector_file}" \
	            --outfile "${saved_txt}"

	        # Keep only the first space-separated column. The former "\|" was an
	        # escaped pipe: cat received "|", "cut", ... as file names and no
	        # column was ever extracted.
	        cut -d" " -f1 "${saved_txt}" > "${saved_lst}"
	    done
	done
	23

utils/extract-labels.sh

Diff comments View file @ e63ab06

File was created	1
	2	# Run bin/extract_kmeans.py for one hard-coded pair (k=4, kmean=88) on data/xvectors.txt.
	3	# k is the first sub-directory of EXP_DIR; kmean is the second one and names the clustering_<kmean>.pkl file
	4	k=4
	5	kmean=88
	6
	7
	8	# Vector features file
	9	VECTOR_FILE_MASSEFFECT="data/xvectors.txt"
	10
	11
	12	# Experiment directory and saved clustering model
	13	EXP_DIR="exp/kmeans_euclidian/xvectors/${k}/${kmean}"
	14	CLUSTERING="${EXP_DIR}/clustering_${kmean}.pkl"
	15
	16
	17	# Output file
	18	OUTFILE_MASSEFFECT="data/xvectors/saved_clustered/masseffect_clustered_xvectors_${k}_${kmean}.txt"
	19	# Arguments: clustering model, vector file, then the output path via --outfile
	20	python3 bin/extract_kmeans.py "${CLUSTERING}" \
	21	"${VECTOR_FILE_MASSEFFECT}" \
	22	--outfile "$OUTFILE_MASSEFFECT"
	23

utils/rm-unused-files.sh

Diff comments View file @ e63ab06

File was created	1
	# Usage: rm-unused-files.sh EXP_DIR
	# Removes EXP_DIR/<kfold>/<k>/clustered_<k>.txt for kfold in 1..4 and
	# k in 1..100.
	if [ $# -ne 1 ]
	then
	    echo "Need to have one and only one argument. This argument is the exp directory." >&2
	    exit 1
	fi
	EXP_DIR="$1"

	for kfold in {1..4}
	do
	    for k in {1..100}
	    do
	        # -f: a missing file is not an error (not every k needs to exist).
	        # ${EXP_DIR:?}: abort instead of deleting under "/" when the
	        # argument is an empty string. Quoted so paths with spaces work.
	        rm -f -- "${EXP_DIR:?}/${kfold}/${k}/clustered_${k}.txt"
	    done
	done
	17

utils/transform_exp_to_kd.sh

Diff comments View file @ e63ab06

File was created	1
	2	# -- DESCRIPTION --
	3	#
	4	# This script transforms data into a shape that is
	5	# usable mainly by knowledge distillation scripts.
	6	#
	7	# First, it extracts the clustering labels,
	8	# then replaces the features with the given ones,
	9	# and finally generates a list file.
	10	#
	11	# The pair features files and list file will be usable
	12	# by the knowledge distillation system.
	13	# --------------------
	14
	15
	# -- CONFIGURATION --
	# Stop at the first command that fails
	set -e

	# KFOLD config: first and last fold index
	MIN_KFOLD=1
	MAX_KFOLD=4

	# KMEAN config: first and last k-means value
	MIN_KMEAN=2
	MAX_KMEAN=100

	# Vector features file: ${FEATURES_DIR}/${FEATURES_PREFIX}_<fold>${FEATURES_SUFFIX}
	DATADIR="data"
	FEATURES_DIR="${DATADIR}/pv_from_xv"
	FEATURES_PREFIX="me_pv_teacher"
	FEATURES_SUFFIX=".txt"

	EXP_DIR="exp/kmeans_euclidian/pv_from_xv"
	VECTOR_FILE_MASSEFFECT="${DATADIR}/xvectors.txt"
	OUTDIR="data/pv_from_xv/saved_clustered"

	# -- CREATE DIRECTORIES
	# OUTPUT DIRECTORY: "mkdir -p" is a no-op when the directory exists, so no
	# "-d" test is needed; the path is quoted to survive spaces.
	mkdir -p -- "${OUTDIR}"
	44
	45
	46	# -- FUNCTIONS --
	47	# Definition of the transform function
	#######################################
	# Process one (fold, k-means) pair: run bin/extract_kmeans.py with the saved
	# clustering model, run bin/replace-features.py on its output file, then
	# write the first space-separated column of that file to a .lst file.
	# Globals (read): k, kmean, EXP_DIR, FEATURES_DIR, FEATURES_PREFIX,
	#                 FEATURES_SUFFIX, VECTOR_FILE_MASSEFFECT, OUTDIR
	# Arguments: none
	# Outputs: progress messages on stdout;
	#          ${OUTDIR}/masseffect_clustered_<k>_<kmean>.txt and .lst
	# Returns: non-zero as soon as one of the steps fails
	#######################################
	transform() {
	    # Directory holding the clustering model of this pair
	    local sub_exp_dir="${EXP_DIR}/${k}/${kmean}"
	    local clustering="${sub_exp_dir}/clustering_${kmean}.pkl"

	    # Features file of the current fold
	    local initial_vector_file="${FEATURES_DIR}/${FEATURES_PREFIX}_${k}${FEATURES_SUFFIX}"

	    # Output files share the same base name
	    local out_base="${OUTDIR}/masseffect_clustered_${k}_${kmean}"
	    local outfile="${out_base}.txt"

	    # Information of the current process
	    echo "[KFOLD, KMEAN]: [${k}, ${kmean}]"

	    echo "Extracting clustering labels"
	    python3 bin/extract_kmeans.py "${clustering}" \
	        "${initial_vector_file}" \
	        --outfile "${outfile}" || return

	    # python3 (not "python") so both steps run under the same interpreter;
	    # arguments are quoted so that paths with spaces stay single arguments.
	    echo "Changing features"
	    python3 bin/replace-features.py "${VECTOR_FILE_MASSEFFECT}" "${outfile}" || return

	    # Extracting list file: first space-separated column only
	    cut -d' ' -f1 "${outfile}" > "${out_base}.lst"
	}
	78
	79
	# -- MAIN LOOPS
	# Call transform once for every fold / k-means pair. transform takes no
	# arguments: it reads the loop variables k and kmean directly.
	for k in $(seq ${MIN_KFOLD} ${MAX_KFOLD}); do
	  for kmean in $(seq ${MIN_KMEAN} ${MAX_KMEAN}); do
	    transform
	  done
	done
	88

GITLAB

Quillot Mathias / Clustering

New organisation of the project