Commit e63ab06fc786597258d861e68c335de9e2afceb4

Authored by Mathias Quillot
1 parent c95c2bf75c
Exists in master

New organisation of the project

Showing 44 changed files with 544 additions and 210 deletions Side-by-side Diff

1 1 # Clustering
2 2 A repository where I put everything dealing with clustering algorithms.
3 3  
  4 +# How to use
  5 +You can run the run.sh script directly if you want; you only need to provide the data.
  6 +
  7 +You can also use the scripts in the utils directory, but run them from the root directory "clustering/".
  8 +
4 9 # TODO
5 10 - Organiser les différentes listes de données pour mes expériences
6 11 - Create a data file example
bin/regroup-measures.py
... ... @@ -40,6 +40,8 @@
40 40 # -- PARSER
41 41 parser = argparse.ArgumentParser(description="")
42 42 parser.add_argument("expdir", type=str, help="Directory of experiment")
  43 +parser.add_argument("--nkfold", type=int, default=4, help="number of kfold")
  44 +parser.add_argument("--nkfoldmin", type=int, default=1, help="Begin with this numero of kfold")
43 45 parser.add_argument("--measurefile", type=str, default="measures.json",
44 46 help="Measure file it searchs in folders")
45 47 parser.add_argument("--suffix", type=str, default="",
... ... @@ -49,6 +51,8 @@
49 51 EXP_DIR = args.expdir
50 52 MEASURE_FILE = args.measurefile
51 53 SUFFIX = args.suffix
  54 +MAX_KFOLD = args.nkfold
  55 +MIN_KFOLD = args.nkfoldmin
52 56  
53 57 # EXP_DIR="exp/kmeans_teacher_1/pvector-1"
54 58 RESULTS_DIR = os.path.join(EXP_DIR, "res")
... ... @@ -83,7 +87,7 @@
83 87  
84 88 measures = init_measures()
85 89  
86   -for kfold in range(1, 5):
  90 +for kfold in range(MIN_KFOLD, MAX_KFOLD + 1):
87 91 print("Regrouping on kfold: " + str(kfold))
88 92 # -- REGROUP MEASURES INTO LISTS
89 93 for k in range(kmin, kmax+1):
bin/replace-features.py
  1 +
  2 +import argparse
  3 +
  4 +from data import read_file, index_by_id, write_line
  5 +
  6 +# -- ARGPARSE
  7 +parser = argparse.ArgumentParser(
  8 + description="Replace features with file from to file to")
  9 +parser.add_argument("fromfile", type=str, help="From list or features file")
  10 +parser.add_argument("tofile", type=str, help="Features of 'from' saved into this file.")
  11 +
  12 +args = parser.parse_args()
  13 +FROM = args.fromfile
  14 +TO = args.tofile
  15 +
  16 +
  17 +# -- READ AND INDEX FILES
  18 +from_data = read_file(FROM)
  19 +from_by_id = index_by_id(from_data)
  20 +
  21 +to_data = read_file(TO)
  22 +
  23 +with open(TO, "w") as f:
  24 + for line in to_data:
  25 + metas = line[0]
  26 + features = from_by_id[metas[0]][metas[3]][1]
  27 + write_line(metas, features, f)
config/archives/ivector_config.sh
  1 +OUTDIR="exp/kmeans_euclidian/ivectors"
  2 +DATADIR="data"
  3 +NEW_LSTDIR="${OUTDIR}/lst"
  4 +
  5 +VECTOR_FILE="data/ivectors.txt" # To specify if there's only one
  6 +VECTOR_FILES_ONE=true # Specify there's only one file
  7 +
  8 +KMIN=2
  9 +KMAX=100
config/archives/pv_from_xv_config.sh
  1 +
  2 +# Framework configuration
  3 +OUTDIR="exp/kmeans_euclidian/pv_from_xv"
  4 +DATADIR="data"
  5 +NEW_LSTDIR="${OUTDIR}/lst"
  6 +
  7 +VECTOR_FILES_BEGIN="${DATADIR}/pv_from_xv/me_pv_teacher"
  8 +VECTOR_FILES_END=".txt"
  9 +VECTOR_FILE="" # To specify if there's only one
  10 +VECTOR_FILES_ONE=false # Specify there's only one file
  11 +
  12 +KMIN=2
  13 +KMAX=100
config/archives/pvector_config.sh
  1 +
  2 +OUTDIR="exp/kmeans_euclidian/teacher-pvector-1"
  3 +DATADIR="data"
  4 +NEW_LSTDIR="${OUTDIR}/lst"
  5 +
  6 +VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher"
  7 +VECTOR_FILES_END=".txt"
  8 +VECTOR_FILE="" # To specify if there's only one
  9 +VECTOR_FILES_ONE=false # Specify there's only one file
  10 +
  11 +KMIN=2
  12 +KMAX=100
config/archives/pvector_layer1_config.sh
  1 +OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer1"
  2 +DATADIR="data"
  3 +NEW_LSTDIR="${OUTDIR}/lst"
  4 +
  5 +VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_1"
  6 +VECTOR_FILES_END=".txt"
  7 +VECTOR_FILE="" # To specify if there's only one
  8 +VECTOR_FILES_ONE=false # Specify there's only one file
  9 +
  10 +KMIN=2
  11 +KMAX=100
config/archives/pvector_layer2_config.sh
  1 +OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer2"
  2 +DATADIR="data"
  3 +NEW_LSTDIR="${OUTDIR}/lst"
  4 +
  5 +VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_2"
  6 +VECTOR_FILES_END=".txt"
  7 +VECTOR_FILE="" # To specify if there's only one
  8 +VECTOR_FILES_ONE=false # Specify there's only one file
  9 +
  10 +KMIN=2
  11 +KMAX=100
config/archives/pvector_layer3_config.sh
  1 +OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer3"
  2 +DATADIR="data"
  3 +NEW_LSTDIR="${OUTDIR}/lst"
  4 +
  5 +VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_3"
  6 +VECTOR_FILES_END=".txt"
  7 +VECTOR_FILE="" # To specify if there's only one
  8 +VECTOR_FILES_ONE=false # Specify there's only one file
  9 +
  10 +KMIN=2
  11 +KMAX=100
config/archives/pvector_layer4_config.sh
  1 +OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer4"
  2 +DATADIR="data"
  3 +NEW_LSTDIR="${OUTDIR}/lst"
  4 +
  5 +VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_4"
  6 +VECTOR_FILES_END=".txt"
  7 +VECTOR_FILE="" # To specify if there's only one
  8 +VECTOR_FILES_ONE=false # Specify there's only one file
  9 +
  10 +KMIN=2
  11 +KMAX=100
config/archives/xvector_config.sh
  1 +OUTDIR="exp/kmeans_euclidian/xvectors"
  2 +DATADIR="data"
  3 +NEW_LSTDIR="${OUTDIR}/lst"
  4 +
  5 +VECTOR_FILE="data/xvectors.txt" # To specify if there's only one
  6 +VECTOR_FILES_ONE=true # Specify there's only one file
  7 +
  8 +KMIN=2
  9 +KMAX=100
  1 +OUTDIR="exp/kmeans_euclidian/iv"
  2 +DATADIR="data"
  3 +NEW_LSTDIR="${OUTDIR}/lst"
  4 +
  5 +VECTOR_FILE="data/ivectors.txt" # To specify if there's only one
  6 +VECTOR_FILES_ONE=true # Specify there's only one file
  7 +
  8 +METAS_CHARACTER="data/masseffect.lst"
  9 +CHAR_INFO="data/masseffect_character_information.csv"
  10 +
  11 +ORIGINAL_VECTOR_FILE="${VECTOR_FILE}"
  12 +
  13 +KMIN=2
  14 +KMAX=100
config/config_iv_skyrim.sh
  1 +OUTDIR="exp/kmeans_euclidian_skyrim/iv"
  2 +DATADIR="data"
  3 +NEW_LSTDIR="${OUTDIR}/lst"
  4 +
  5 +VECTOR_FILE="../data/skyrim/skyrim_ivectors.txt" # To specify if there's only one
  6 +VECTOR_FILES_ONE=true # Specify there's only one file
  7 +
  8 +METAS_CHARACTER="../data/skyrim/skyrim.lst"
  9 +CHAR_INFO="data/skyrim_character_information.csv"
  10 +
  11 +ORIGINAL_VECTOR_FILE="${VECTOR_FILE}"
  12 +
  13 +KMIN=2
  14 +KMAX=100
config/config_pv_from_iv.sh
  1 +
  2 +if [ -z "$kfold" ]
  3 +then
  4 + kfold=1
  5 +fi
  6 +
  7 +if [ -z "${t}" ]
  8 +then
  9 + t=2.0
  10 +fi
  11 +
  12 +OUTDIR="exp/kmeans_euclidian/pv_from_iv/${kfold}"
  13 +DATADIR="data"
  14 +MOTHER_LST_DIR="/local_disk/pegasus/laboinfo/mquillot/vocal_similarity_system/data/prot_alpha"
  15 +NEW_LSTDIR="${OUTDIR}/lst"
  16 +
  17 +
  18 +VECTOR_FILE="/local_disk/pegasus/laboinfo/mquillot/knowledge_distillation/exp/kd_iv/${kfold}/${t}/teacher/masseffect_pvectors.txt" # To specify if there's only one
  19 +VECTOR_FILES_ONE=true # Specify there's only one file
  20 +ORIGINAL_VECTOR_FILE="/local_disk/pegasus/laboinfo/mquillot/knowledge_distillation/data/masseffect.txt"
  21 +
  22 +
  23 +MIN_KFOLD=${kfold}
  24 +MAX_KFOLD=${kfold}
  25 +
  26 +KMIN=2
  27 +KMAX=100
config/config_pv_from_xv.sh
  1 +
  2 +if [ -z "$kfold" ]
  3 +then
  4 + kfold=1
  5 +fi
  6 +
  7 +if [ -z "${t}" ]
  8 +then
  9 + t=2.0
  10 +fi
  11 +
  12 +OUTDIR="exp/kmeans_euclidian/pv_from_xv/${kfold}"
  13 +DATADIR="data"
  14 +MOTHER_LST_DIR="/local_disk/pegasus/laboinfo/mquillot/vocal_similarity_system/data/prot_alpha"
  15 +NEW_LSTDIR="${OUTDIR}/lst"
  16 +
  17 +
  18 +VECTOR_FILE="/local_disk/pegasus/laboinfo/mquillot/knowledge_distillation/exp/kd_xvectors/${kfold}/${t}/teacher/masseffect_pvectors.txt" # To specify if there's only one
  19 +VECTOR_FILES_ONE=true # Specify there's only one file
  20 +ORIGINAL_VECTOR_FILE="/local_disk/pegasus/laboinfo/mquillot/knowledge_distillation/data/masseffect_xvectors.txt"
  21 +
  22 +MIN_KFOLD=${kfold}
  23 +MAX_KFOLD=${kfold}
  24 +
  25 +KMIN=2
  26 +KMAX=100
config/config_without_kfold_iv.sh
  1 +OUTDIR="exp/kmeans_euclidian_skyrim/ivectors"
  2 +DATADIR="data"
  3 +NEW_LSTDIR="${OUTDIR}/lst"
  4 +
  5 +LST_FILE="/local_disk/pegasus/laboinfo/mquillot/data/skyrim/skyrim_ivectors.txt"
  6 +VECTOR_FILE="data/ivectors.txt" # To specify if there's only one
  7 +VECTOR_FILES_ONE=true # Specify there's only one file
  8 +
  9 +WITHOUT_KFOLD=""
  10 +KMIN=2
  11 +KMAX=100
  12 +
  13 +METAS_CHARACTER=""
  1 +OUTDIR="exp/kmeans_euclidian/xv"
  2 +DATADIR="data"
  3 +NEW_LSTDIR="${OUTDIR}/lst"
  4 +
  5 +VECTOR_FILE="data/xvectors.txt" # To specify if there's only one
  6 +VECTOR_FILES_ONE=true # Specify there's only one file
  7 +
  8 +ORIGINAL_VECTOR_FILE="${VECTOR_FILE}"
  9 +KMIN=2
  10 +KMAX=100
config/ivector_config.sh
1   -OUTDIR="exp/kmeans_euclidian/ivectors"
2   -DATADIR="data"
3   -NEW_LSTDIR="${OUTDIR}/lst"
4   -
5   -VECTOR_FILE="data/ivectors.txt" # To specify if there's only one
6   -VECTOR_FILES_ONE=true # Specify there's only one file
7   -
8   -KMIN=2
9   -KMAX=100
config/pv_from_xv_config.sh
1   -
2   -# Framework configuration
3   -OUTDIR="exp/kmeans_euclidian/pv_from_xv"
4   -DATADIR="data"
5   -NEW_LSTDIR="${OUTDIR}/lst"
6   -
7   -VECTOR_FILES_BEGIN="${DATADIR}/pv_from_xv/me_pv_teacher"
8   -VECTOR_FILES_END=".txt"
9   -VECTOR_FILE="" # To specify if there's only one
10   -VECTOR_FILES_ONE=false # Specify there's only one file
11   -
12   -KMIN=2
13   -KMAX=100
config/pvector_config.sh
1   -
2   -OUTDIR="exp/kmeans_euclidian/teacher-pvector-1"
3   -DATADIR="data"
4   -NEW_LSTDIR="${OUTDIR}/lst"
5   -
6   -VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher"
7   -VECTOR_FILES_END=".txt"
8   -VECTOR_FILE="" # To specify if there's only one
9   -VECTOR_FILES_ONE=false # Specify there's only one file
10   -
11   -KMIN=2
12   -KMAX=100
config/pvector_layer1_config.sh
1   -OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer1"
2   -DATADIR="data"
3   -NEW_LSTDIR="${OUTDIR}/lst"
4   -
5   -VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_1"
6   -VECTOR_FILES_END=".txt"
7   -VECTOR_FILE="" # To specify if there's only one
8   -VECTOR_FILES_ONE=false # Specify there's only one file
9   -
10   -KMIN=2
11   -KMAX=100
config/pvector_layer2_config.sh
1   -OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer2"
2   -DATADIR="data"
3   -NEW_LSTDIR="${OUTDIR}/lst"
4   -
5   -VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_2"
6   -VECTOR_FILES_END=".txt"
7   -VECTOR_FILE="" # To specify if there's only one
8   -VECTOR_FILES_ONE=false # Specify there's only one file
9   -
10   -KMIN=2
11   -KMAX=100
config/pvector_layer3_config.sh
1   -OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer3"
2   -DATADIR="data"
3   -NEW_LSTDIR="${OUTDIR}/lst"
4   -
5   -VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_3"
6   -VECTOR_FILES_END=".txt"
7   -VECTOR_FILE="" # To specify if there's only one
8   -VECTOR_FILES_ONE=false # Specify there's only one file
9   -
10   -KMIN=2
11   -KMAX=100
config/pvector_layer4_config.sh
1   -OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer4"
2   -DATADIR="data"
3   -NEW_LSTDIR="${OUTDIR}/lst"
4   -
5   -VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_4"
6   -VECTOR_FILES_END=".txt"
7   -VECTOR_FILE="" # To specify if there's only one
8   -VECTOR_FILES_ONE=false # Specify there's only one file
9   -
10   -KMIN=2
11   -KMAX=100
config/xvector_config.sh
1   -OUTDIR="exp/kmeans_euclidian/xvectors"
2   -DATADIR="data"
3   -NEW_LSTDIR="${OUTDIR}/lst"
4   -
5   -VECTOR_FILE="data/xvectors.txt" # To specify if there's only one
6   -VECTOR_FILES_ONE=true # Specify there's only one file
7   -
8   -KMIN=2
9   -KMAX=100
extract-labels-pv-from-xv.sh
1   -
2   -
3   -# Number of set
4   -k=4
5   -
6   -
7   -# Vector features file
8   -DATADIR="data"
9   -
10   -VECTOR_FILE_MASSEFFECT="${DATADIR}/xvectors.txt"
11   -
12   -for kmean in 12 41 45 50 6 69 72 88
13   -do
14   - echo "KMEAN: ${kmean}"
15   - # Dirs
16   - EXP_DIR="exp/kmeans_euclidian/pv_from_xv/${k}/${kmean}"
17   - CLUSTERING="${EXP_DIR}/clustering_${kmean}.pkl"
18   -
19   -
20   - # Output dirs
21   - OUTFILE_MASSEFFECT="data/pv_from_xv/saved_clustered/masseffect_clustered_${k}_${kmean}.txt"
22   - echo "Extracting"
23   - python3 bin/extract_kmeans.py "${CLUSTERING}" \
24   - "${VECTOR_FILE_MASSEFFECT}" \
25   - --outfile "$OUTFILE_MASSEFFECT"
26   - echo "End extracting"
27   -done
extract-labels.sh
1   -
2   -
3   -# Number of set
4   -k=4
5   -kmean=88
6   -
7   -
8   -# Vector features file
9   -VECTOR_FILE_MASSEFFECT="data/xvectors.txt"
10   -
11   -
12   -# Dirs
13   -EXP_DIR="exp/kmeans_euclidian/xvectors/${k}/${kmean}"
14   -CLUSTERING="${EXP_DIR}/clustering_${kmean}.pkl"
15   -
16   -
17   -# Output dirs
18   -OUTFILE_MASSEFFECT="data/xvectors/saved_clustered/masseffect_clustered_xvectors_${k}_${kmean}.txt"
19   -
20   -python3 bin/extract_kmeans.py "${CLUSTERING}" \
21   - "${VECTOR_FILE_MASSEFFECT}" \
22   - --outfile "$OUTFILE_MASSEFFECT"
rm-unused-files.sh
1   -
2   -if [ $# -eq 1 ]
3   -then
4   - EXP_DIR="$1"
5   -else
6   - echo "Need to have one and only one argument. This argument is the exp directory."
7   - exit 1
8   -fi
9   -
10   -for kfold in {1..4}
11   -do
12   - for k in {1..100}
13   - do
14   - rm ${EXP_DIR}/$kfold/$k/clustered_$k.txt
15   - done
16   -done
... ... @@ -29,7 +29,7 @@
29 29  
30 30  
31 31 # -- TRAIN KMEANS
32   -echo "Clustering - ${kfold}"
  32 +echo "Clustering - ${kfold}"sss
33 33 python3 bin/cluster_kmeans.py "${VECTOR_FILE}" \
34 34 "${TRAIN_LST}" \
35 35 "${EXP_DIR}" --kmin ${KMIN} --kmax ${KMAX}
... ... @@ -2,7 +2,9 @@
2 2 # quelques petites commandes que l'on souhaite
3 3 # tester.
4 4  
5   -OUTDIR="exp/kmeans_euclidian/teacher-pvector-1"
  5 +set -e
  6 +
  7 +OUTDIR="exp/kmeans_euclidian/ivectors"
6 8 EXP_DIR=${OUTDIR}
7 9 DATADIR="data"
8 10 NEW_LSTDIR="${OUTDIR}/lst"
... ... @@ -22,8 +24,8 @@
22 24  
23 25 for kfold in {1..4}
24 26 do
25   - pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt"
26   - VECTOR_FILE=$pvector_file
  27 + #pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt"
  28 + VECTOR_FILE="${DATADIR}/ivectors.txt"
27 29 lst_dir="${DATADIR}/pvectors_1rst/lst"
28 30 output_kfold="${OUTDIR}/${kfold}"
29 31  
30 32  
31 33  
32 34  
... ... @@ -61,58 +63,19 @@
61 63 do
62 64 echo "Kmeans Measuring and ploting - ${k}"
63 65  
64   - SUB_EXP_DIR="${EXP_DIR}/${kfold}/${k}"
  66 + SUB_EXP_DIR="${EXP_DIR}/${kfold}/${k}"
65 67  
66   - # -- EXTRACT CLUSTERING LABELS
67   - python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \
68   - "${VECTOR_FILE}" \
69   - --outfile "${SUB_EXP_DIR}/clustered_${k}.txt"
  68 + # -- EXTRACT CLUSTERING LABELS
  69 + python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \
  70 + "${VECTOR_FILE}" \
  71 + --outfile "${SUB_EXP_DIR}/clustered_${k}.txt"
70 72  
71   - # -- MEASURES AND PLOT WITH RESPECT TO LANG VAR
72   - # Measures
73   - python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
74   - "${METAS_LANG}" \
75   - "${TRAIN_LST}" \
76   - "${VAL_LST}" \
77   - --outfile "${SUB_EXP_DIR}/measures_lang.json"
  73 + # -- MEASURES AND PLOT
  74 + source steps/measure_clustering_char.sh
  75 + source steps/measure_clustering_type.sh
  76 + source steps/measure_clustering_lang.sh
78 77  
79   - # This script plot the count matrix of the train set
80   - python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
81   - "${METAS_LANG}" \
82   - "${TRAIN_LST}" \
83   - --outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf"
84   -
85   - # This script plot the count matrix of the validation set
86   - python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
87   - "${METAS_LANG}" \
88   - "${VAL_LST}" \
89   - --outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf"
90   -
91   - rm ${SUB_EXP_DIR}/clustered_${k}.txt
92   - #python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" \
93   - # "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" \
94   - # "${lst_dir}/val_${kfold}.lst" \
95   - # --outfile "${output_kfold}/${k}/measures_type.json"
96   -
97   - # This script plot the count matrix of the train set
98   - #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
99   - # ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \
100   - # --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf
101   -
102   - # This script plot the count matrix of the validation set
103   - #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
104   - # ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \
105   - # --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf
106   -
107   - # This script plot the count matrix of the train set
108   - #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
109   - # ${pvector_file} ${lst_dir}/train_${kfold}.lst \
110   - # --outfile ${output_kfold}/${k}/train_count_matrix.pdf
111   -
112   - # This script plot the count matrix of the validation set
113   - #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
114   - # ${pvector_file} ${lst_dir}/val_${kfold}.lst \
115   - # --outfile ${output_kfold}/${k}/val_count_matrix.pdf
  78 + rm ${SUB_EXP_DIR}/clustered_${k}.txt
116 79 done
117 80 done
  1 +python bin/cluster_kmeans.py ../data/skyrim/skyrim_ivectors.txt ../data/skyrim/skyrim.lst exp/kmeans_euclidian_skyrim/ivectors/ --kmin 1 --kmax 100
... ... @@ -31,7 +31,17 @@
31 31 fi
32 32  
33 33  
  34 +if [ -z "$METAS_CHARACTER" ]
  35 +then
  36 + METAS_CHARACTER="${DATADIR}/masseffect.lst"
  37 +fi
34 38  
  39 +
  40 +if [ -z "$CHAR_INFO" ]
  41 +then
  42 + CHAR_INFO="${DATADIR}/character_information.csv"
  43 +fi
  44 +
35 45 # -- MAKE DIRECTORIES
36 46 if [ ! -d "$OUTDIR" ];
37 47 then
... ... @@ -59,7 +69,6 @@
59 69 for kfold in $(seq ${MIN_KFOLD} ${MAX_KFOLD})
60 70 do
61 71 # Some useful variables
62   - CHAR_INFO="${DATADIR}/character_information.csv"
63 72 TRAIN_TYPE_LST="${NEW_LSTDIR}/train_${kfold}_type.lst"
64 73 VAL_TYPE_LST="${NEW_LSTDIR}/val_${kfold}_type.lst"
65 74 TRAIN_LANG_LST="${NEW_LSTDIR}/train_${kfold}_lang.lst"
66 75  
... ... @@ -71,11 +80,10 @@
71 80 VECTOR_FILE="${VECTOR_FILES_BEGIN}_${kfold}${VECTOR_FILES_END}"
72 81 fi
73 82  
74   - TRAIN_LST="${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst"
75   - VAL_LST="${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst"
  83 + TRAIN_LST="${MOTHER_LST_DIR}/lst/train_${kfold}.lst"
  84 + VAL_LST="${MOTHER_LST_DIR}/lst/val_${kfold}.lst"
76 85 EXP_DIR="${OUTDIR}/${kfold}"
77 86 METAS_TYPE="${NEW_LSTDIR}/metas_${kfold}_type.lst"
78   - METAS_CHARACTER="${DATADIR}/masseffect.lst"
79 87 METAS_LANG="${NEW_LSTDIR}/metas_${kfold}_lang.lst"
80 88  
81 89  
  1 +
  2 +for kfold in `seq 1 4`
  3 +do
  4 + echo "KFOLD: ${kfold}"
  5 + source run.sh
  6 +done
run_without_kfold.sh
  1 +
  2 +for k in $(seq ${KMIN} 1 ${KMAX})
  3 +do
  4 + SUB_EXP_DIR="${EXP_DIR}/${k}"
  5 +
  6 + # -- EXTRACT KMEANS VALUES
  7 + echo "Kmeans Measuring and extraction - ${k}"
  8 + python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \
  9 + "${VECTOR_FILE}" \
  10 + --outfile "${SUB_EXP_DIR}/clustered_${k}.txt"
  11 +
  12 + python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
  13 + "${METAS_CHARACTER}" \
  14 + "${TRAIN_LST}" \
  15 + "${VAL_LST}" \
  16 + --outfile "${SUB_EXP_DIR}/measures.json"
steps/extract_cluster_file.sh
  1 +
  2 +for kfold in `seq 1 4`
  3 +do
  4 + source $1
  5 + vector_file=${VECTOR_FILE}
  6 + echo "kfold: $kfold"
  7 + for kmean in `seq 2 100`
  8 + do
  9 + echo "kmean: $kmean"
  10 + exp_dir="${OUTDIR}/${kfold}/${kmean}"
  11 + clustering="${exp_dir}/clustering_${kmean}.pkl"
  12 + save_loc="${exp_dir}"
  13 + saved_txt="${save_loc}/masseffect_clustered.txt"
  14 + saved_lst="${save_loc}/masseffect_clustered.lst"
  15 +
  16 + python3 bin/extract_kmeans.py "${clustering}" \
  17 + "${vector_file}" \
  18 + --outfile "${saved_txt}"
  19 +
  20 + cat ${saved_txt} | cut -d" " -f1 > ${saved_lst}
  21 +
  22 + python3 bin/replace-features.py "${ORIGINAL_VECTOR_FILE}" "${saved_txt}"
  23 + done
  24 +done
steps/extract_cluster_file_skyrim.sh
  1 +
  2 +source $1
  3 +vector_file=${VECTOR_FILE}
  4 +echo "kfold: $kfold"
  5 +for kmean in `seq 2 100`
  6 +do
  7 + echo "kmean: $kmean"
  8 + exp_dir="${OUTDIR}/${kmean}"
  9 + clustering="${exp_dir}/clustering_${kmean}.pkl"
  10 + save_loc="${exp_dir}"
  11 + saved_txt="${save_loc}/masseffect_clustered.txt"
  12 + saved_lst="${save_loc}/masseffect_clustered.lst"
  13 +
  14 + python3 bin/extract_kmeans.py "${clustering}" \
  15 + "${vector_file}" \
  16 + --outfile "${saved_txt}"
  17 +
  18 + cat ${saved_txt} | cut -d" " -f1 > ${saved_lst}
  19 +
  20 + python3 bin/replace-features.py "${ORIGINAL_VECTOR_FILE}" "${saved_txt}"
  21 +done
steps/extract_language_lst.sh
  1 +DATADIR="data"
  2 +OUTDIR="exp/kmeans_euclidian/ivectors"
  3 +NEW_LSTDIR="${OUTDIR}/lst"
  4 +
  5 +TRAIN_LST=${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst
  6 +VAL_LST=${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst
  7 +TRAIN_LANG_LST=${NEW_LSTDIR}/train_${kfold}_lang.lst
  8 +VAL_LANG_LST=${NEW_LSTDIR}/val_${kfold}_lang.lst
  9 +METAS_LANG=${NEW_LSTDIR}/metas_${kfold}_lang.lst
  10 +
  11 +
  12 +awk '$2=$1' FS=, OFS=, ${TRAIN_LST} > ${TRAIN_LANG_LST}
  13 +echo "VAL EXTRACT LANGUAGE INFO DONE"
  14 +awk '$2=$1' FS=, OFS=, ${VAL_LST} > ${VAL_LANG_LST}
  15 +echo "TRAIN EXTRACT LANGUAGE INFO DONE"
  16 +cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}"
  17 +echo "GLOBAL EXTRACT LANGUAGE INFO DONE"
steps/measure_clustering_char.sh
  1 +
  2 +python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" \
  3 + "${lst_dir}/trainval_${kfold}.lst" "${lst_dir}/train_${kfold}.lst" \
  4 + "${lst_dir}/val_${kfold}.lst" \
  5 + --outfile "${output_kfold}/${k}/measures.json"
  6 +
  7 +
  8 +# This script plot the count matrix of the train set
  9 +python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
  10 + "${lst_dir}/train_${kfold}.lst" \
  11 + "${lst_dir}/train_${kfold}.lst" \
  12 + --outfile "${SUB_EXP_DIR}/train_count_matrix.pdf"
  13 +
  14 +# This script plot the count matrix of the validation set
  15 +python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
  16 + "${lst_dir}/val_${kfold}.lst" \
  17 + "${lst_dir}/val_${kfold}.lst" \
  18 + --outfile "${SUB_EXP_DIR}/val_count_matrix.pdf"
steps/measure_clustering_lang.sh
  1 +
  2 +python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
  3 + "${METAS_LANG}" \
  4 + "${TRAIN_LST}" \
  5 + "${VAL_LST}" \
  6 + --outfile "${SUB_EXP_DIR}/measures_lang.json"
  7 +
  8 +# This script plot the count matrix of the train set
  9 +python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
  10 + "${METAS_LANG}" \
  11 + "${TRAIN_LST}" \
  12 + --outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf"
  13 +
  14 +# This script plot the count matrix of the validation set
  15 +python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
  16 + "${METAS_LANG}" \
  17 + "${VAL_LST}" \
  18 + --outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf"
steps/measure_clustering_type.sh
  1 +python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" \
  2 + "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" \
  3 + "${lst_dir}/val_${kfold}.lst" \
  4 + --outfile "${output_kfold}/${k}/measures_type.json"
  5 +
  6 +# This script plot the count matrix of the train set
  7 +python3 bin/plot-count-matrix.py "${output_kfold}/${k}/clustered_${k}.txt" \
  8 + "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" \
  9 + --outfile "${output_kfold}/${k}/train_count_matrix_type.pdf"
  10 +
  11 +# This script plot the count matrix of the validation set
  12 +python3 bin/plot-count-matrix.py "${output_kfold}/${k}/clustered_${k}.txt" \
  13 + "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/val_${kfold}.lst" \
  14 + --outfile "${output_kfold}/${k}/val_count_matrix_type.pdf"
  15 +
steps/save_clusters_file.sh
  1 +
  2 +vector_file="data/xvectors.txt"
  3 +
  4 +for kfold in `seq 1 4`
  5 +do
  6 + echo "kfold: $kfold"
  7 + for kmean in `seq 2 100`
  8 + do
  9 + echo "kmean: $kmean"
  10 + exp_dir="exp/kmeans_euclidian/xvectors/${kfold}/${kmean}"
  11 + clustering="${exp_dir}/clustering_${kmean}.pkl"
  12 + save_loc="data/xvectors/saved_clustered/"
  13 + saved_txt="${save_loc}/masseffect_clustered_xvectors_${kfold}_${kmean}.txt"
  14 + saved_lst="${save_loc}/masseffect_clustered_xvectors_${kfold}_${kmean}.lst"
  15 +
  16 + python3 bin/extract_kmeans.py "${clustering}" \
  17 + "${vector_file}" \
  18 + --outfile "${saved_txt}"
  19 +
  20 + cat ${saved_txt} | cut -d" " -f1 > ${saved_lst}
  21 + done
  22 +done
utils/extract-labels.sh
  1 +
  2 +
  3 +# Number of set
  4 +k=4
  5 +kmean=88
  6 +
  7 +
  8 +# Vector features file
  9 +VECTOR_FILE_MASSEFFECT="data/xvectors.txt"
  10 +
  11 +
  12 +# Dirs
  13 +EXP_DIR="exp/kmeans_euclidian/xvectors/${k}/${kmean}"
  14 +CLUSTERING="${EXP_DIR}/clustering_${kmean}.pkl"
  15 +
  16 +
  17 +# Output dirs
  18 +OUTFILE_MASSEFFECT="data/xvectors/saved_clustered/masseffect_clustered_xvectors_${k}_${kmean}.txt"
  19 +
  20 +python3 bin/extract_kmeans.py "${CLUSTERING}" \
  21 + "${VECTOR_FILE_MASSEFFECT}" \
  22 + --outfile "$OUTFILE_MASSEFFECT"
utils/rm-unused-files.sh
  1 +
  2 +if [ $# -eq 1 ]
  3 +then
  4 + EXP_DIR="$1"
  5 +else
  6 + echo "Need to have one and only one argument. This argument is the exp directory."
  7 + exit 1
  8 +fi
  9 +
  10 +for kfold in {1..4}
  11 +do
  12 + for k in {1..100}
  13 + do
  14 + rm ${EXP_DIR}/$kfold/$k/clustered_$k.txt
  15 + done
  16 +done
utils/transform_exp_to_kd.sh
  1 +
  2 +# -- DESCRIPTION --
  3 +#
  4 +# This script aims to transform data in a shape that is
  5 +# usable mainly by knowledge distillation scripts.
  6 +#
  7 +# Firstly, it extracts clustering labels
  8 +# then change features with the given one
  9 +# and finally generate a list file.
  10 +#
  11 +# The pair features files and list file will be usable
  12 +# by the knowledge distillation system.
  13 +# --------------------
  14 +
  15 +
  16 +# -- CONFIGURATION --
  17 +# Configuration error
  18 +set -e
  19 +
  20 +# KFOLD config
  21 +MIN_KFOLD=1
  22 +MAX_KFOLD=4
  23 +
  24 +# KMEAN config
  25 +MIN_KMEAN=2
  26 +MAX_KMEAN=100
  27 +
  28 +# Vector features file
  29 +DATADIR="data"
  30 +FEATURES_DIR="${DATADIR}/pv_from_xv"
  31 +FEATURES_PREFIX="me_pv_teacher"
  32 +FEATURES_SUFFIX=".txt"
  33 +
  34 +EXP_DIR="exp/kmeans_euclidian/pv_from_xv"
  35 +VECTOR_FILE_MASSEFFECT="${DATADIR}/xvectors.txt"
  36 +OUTDIR="data/pv_from_xv/saved_clustered"
  37 +
  38 +# -- CREATE DIRECTORIES
  39 +# OUTPUT DIRECTORY
  40 +if [ ! -d "${OUTDIR}" ]
  41 +then
  42 + mkdir -p ${OUTDIR}
  43 +fi
  44 +
  45 +
  46 +# -- FUNCTIONS --
  47 +# Definition of the transform function
  48 +function transform() {
  49 + # Define subdir variable
  50 + local SUB_EXP_DIR="${EXP_DIR}/${k}/${kmean}"
  51 +
  52 + # Define features file variable
  53 + local INITIAL_VECTOR_FILE="${FEATURES_DIR}/${FEATURES_PREFIX}_${k}${FEATURES_SUFFIX}"
  54 +
  55 + # Information of the current process
  56 + echo "[KFOLD, KMEAN]: [${k}, ${kmean}]"
  57 +
  58 + # Define clustering model variable
  59 + local CLUSTERING="${SUB_EXP_DIR}/clustering_${kmean}.pkl"
  60 +
  61 +
  62 + # Define output file
  63 + local OUTFILE_MASSEFFECT="${OUTDIR}/masseffect_clustered_${k}_${kmean}.txt"
  64 +
  65 + # Extracting clustering labels
  66 + echo "Extracting clustering labels"
  67 + python3 bin/extract_kmeans.py "${CLUSTERING}" \
  68 + "${INITIAL_VECTOR_FILE}" \
  69 + --outfile "${OUTFILE_MASSEFFECT}"
  70 +
  71 + # Changing features
  72 + echo "Changing features"
  73 + python bin/replace-features.py ${VECTOR_FILE_MASSEFFECT} ${OUTFILE_MASSEFFECT}
  74 +
  75 + # Extracting list file
  76 + cut -d' ' -f1 ${OUTFILE_MASSEFFECT} > "${OUTDIR}/masseffect_clustered_${k}_${kmean}.lst"
  77 +}
  78 +
  79 +
  80 +# -- MAIN LOOPS
  81 +for k in $(seq ${MIN_KFOLD} ${MAX_KFOLD})
  82 +do
  83 + for kmean in $(seq ${MIN_KMEAN} ${MAX_KMEAN})
  84 + do
  85 + transform
  86 + done
  87 +done