Commit e63ab06fc786597258d861e68c335de9e2afceb4
1 parent
c95c2bf75c
Exists in
master
New organisation of the project
Showing 44 changed files with 544 additions and 210 deletions Side-by-side Diff
- README.md
- bin/regroup-measures.py
- bin/replace-features.py
- config/archives/ivector_config.sh
- config/archives/pv_from_xv_config.sh
- config/archives/pvector_config.sh
- config/archives/pvector_layer1_config.sh
- config/archives/pvector_layer2_config.sh
- config/archives/pvector_layer3_config.sh
- config/archives/pvector_layer4_config.sh
- config/archives/xvector_config.sh
- config/config_iv.sh
- config/config_iv_skyrim.sh
- config/config_pv_from_iv.sh
- config/config_pv_from_xv.sh
- config/config_without_kfold_iv.sh
- config/config_xv.sh
- config/ivector_config.sh
- config/pv_from_xv_config.sh
- config/pvector_config.sh
- config/pvector_layer1_config.sh
- config/pvector_layer2_config.sh
- config/pvector_layer3_config.sh
- config/pvector_layer4_config.sh
- config/xvector_config.sh
- extract-labels-pv-from-xv.sh
- extract-labels.sh
- rm-unused-files.sh
- run-clustering.sh
- run-measures.sh
- run-skyrim.sh
- run.sh
- run_kfold.sh
- run_without_kfold.sh
- steps/extract_cluster_file.sh
- steps/extract_cluster_file_skyrim.sh
- steps/extract_language_lst.sh
- steps/measure_clustering_char.sh
- steps/measure_clustering_lang.sh
- steps/measure_clustering_type.sh
- steps/save_clusters_file.sh
- utils/extract-labels.sh
- utils/rm-unused-files.sh
- utils/transform_exp_to_kd.sh
README.md
1 | 1 | # Clustering |
2 | 2 | A repository where I put everything dealing with clustering algorithms. |
3 | 3 | |
4 | +# How to use | |
5 | +You can run the run.sh script directly if you want. You just need data. | |
6 | + | |
7 | +You can also use the scripts in the utils directory, but run them from the root directory "clustering/". | |
8 | + | |
4 | 9 | # TODO |
5 | 10 | - Organiser les différentes listes de données pour mes expériences |
6 | 11 | - Create a data file example |
bin/regroup-measures.py
... | ... | @@ -40,6 +40,8 @@ |
40 | 40 | # -- PARSER |
41 | 41 | parser = argparse.ArgumentParser(description="") |
42 | 42 | parser.add_argument("expdir", type=str, help="Directory of experiment") |
43 | +parser.add_argument("--nkfold", type=int, default=4, help="number of kfold") | |
44 | +parser.add_argument("--nkfoldmin", type=int, default=1, help="Begin with this numero of kfold") | |
43 | 45 | parser.add_argument("--measurefile", type=str, default="measures.json", |
44 | 46 | help="Measure file it searchs in folders") |
45 | 47 | parser.add_argument("--suffix", type=str, default="", |
... | ... | @@ -49,6 +51,8 @@ |
49 | 51 | EXP_DIR = args.expdir |
50 | 52 | MEASURE_FILE = args.measurefile |
51 | 53 | SUFFIX = args.suffix |
54 | +MAX_KFOLD = args.nkfold | |
55 | +MIN_KFOLD = args.nkfoldmin | |
52 | 56 | |
53 | 57 | # EXP_DIR="exp/kmeans_teacher_1/pvector-1" |
54 | 58 | RESULTS_DIR = os.path.join(EXP_DIR, "res") |
... | ... | @@ -83,7 +87,7 @@ |
83 | 87 | |
84 | 88 | measures = init_measures() |
85 | 89 | |
86 | -for kfold in range(1, 5): | |
90 | +for kfold in range(MIN_KFOLD, MAX_KFOLD + 1): | |
87 | 91 | print("Regrouping on kfold: " + str(kfold)) |
88 | 92 | # -- REGROUP MEASURES INTO LISTS |
89 | 93 | for k in range(kmin, kmax+1): |
bin/replace-features.py
1 | + | |
2 | +import argparse | |
3 | + | |
4 | +from data import read_file, index_by_id, write_line | |
5 | + | |
6 | +# -- ARGPARSE | |
7 | +parser = argparse.ArgumentParser( | |
8 | + description="Replace features with file from to file to") | |
9 | +parser.add_argument("fromfile", type=str, help="From list or features file") | |
10 | +parser.add_argument("tofile", type=str, help="Features of 'from' saved into this file.") | |
11 | + | |
12 | +args = parser.parse_args() | |
13 | +FROM = args.fromfile | |
14 | +TO = args.tofile | |
15 | + | |
16 | + | |
17 | +# -- READ AND INDEX FILES | |
18 | +from_data = read_file(FROM) | |
19 | +from_by_id = index_by_id(from_data) | |
20 | + | |
21 | +to_data = read_file(TO) | |
22 | + | |
23 | +with open(TO, "w") as f: | |
24 | + for line in to_data: | |
25 | + metas = line[0] | |
26 | + features = from_by_id[metas[0]][metas[3]][1] | |
27 | + write_line(metas, features, f) |
config/archives/ivector_config.sh
config/archives/pv_from_xv_config.sh
1 | + | |
2 | +# Framework configuration | |
3 | +OUTDIR="exp/kmeans_euclidian/pv_from_xv" | |
4 | +DATADIR="data" | |
5 | +NEW_LSTDIR="${OUTDIR}/lst" | |
6 | + | |
7 | +VECTOR_FILES_BEGIN="${DATADIR}/pv_from_xv/me_pv_teacher" | |
8 | +VECTOR_FILES_END=".txt" | |
9 | +VECTOR_FILE="" # To specify if there's only one | |
10 | +VECTOR_FILES_ONE=false # Specify there's only one file | |
11 | + | |
12 | +KMIN=2 | |
13 | +KMAX=100 |
config/archives/pvector_config.sh
1 | + | |
2 | +OUTDIR="exp/kmeans_euclidian/teacher-pvector-1" | |
3 | +DATADIR="data" | |
4 | +NEW_LSTDIR="${OUTDIR}/lst" | |
5 | + | |
6 | +VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher" | |
7 | +VECTOR_FILES_END=".txt" | |
8 | +VECTOR_FILE="" # To specify if there's only one | |
9 | +VECTOR_FILES_ONE=false # Specify there's only one file | |
10 | + | |
11 | +KMIN=2 | |
12 | +KMAX=100 |
config/archives/pvector_layer1_config.sh
1 | +OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer1" | |
2 | +DATADIR="data" | |
3 | +NEW_LSTDIR="${OUTDIR}/lst" | |
4 | + | |
5 | +VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_1" | |
6 | +VECTOR_FILES_END=".txt" | |
7 | +VECTOR_FILE="" # To specify if there's only one | |
8 | +VECTOR_FILES_ONE=false # Specify there's only one file | |
9 | + | |
10 | +KMIN=2 | |
11 | +KMAX=100 |
config/archives/pvector_layer2_config.sh
1 | +OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer2" | |
2 | +DATADIR="data" | |
3 | +NEW_LSTDIR="${OUTDIR}/lst" | |
4 | + | |
5 | +VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_2" | |
6 | +VECTOR_FILES_END=".txt" | |
7 | +VECTOR_FILE="" # To specify if there's only one | |
8 | +VECTOR_FILES_ONE=false # Specify there's only one file | |
9 | + | |
10 | +KMIN=2 | |
11 | +KMAX=100 |
config/archives/pvector_layer3_config.sh
1 | +OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer3" | |
2 | +DATADIR="data" | |
3 | +NEW_LSTDIR="${OUTDIR}/lst" | |
4 | + | |
5 | +VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_3" | |
6 | +VECTOR_FILES_END=".txt" | |
7 | +VECTOR_FILE="" # To specify if there's only one | |
8 | +VECTOR_FILES_ONE=false # Specify there's only one file | |
9 | + | |
10 | +KMIN=2 | |
11 | +KMAX=100 |
config/archives/pvector_layer4_config.sh
1 | +OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer4" | |
2 | +DATADIR="data" | |
3 | +NEW_LSTDIR="${OUTDIR}/lst" | |
4 | + | |
5 | +VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_4" | |
6 | +VECTOR_FILES_END=".txt" | |
7 | +VECTOR_FILE="" # To specify if there's only one | |
8 | +VECTOR_FILES_ONE=false # Specify there's only one file | |
9 | + | |
10 | +KMIN=2 | |
11 | +KMAX=100 |
config/archives/xvector_config.sh
config/config_iv.sh
1 | +OUTDIR="exp/kmeans_euclidian/iv" | |
2 | +DATADIR="data" | |
3 | +NEW_LSTDIR="${OUTDIR}/lst" | |
4 | + | |
5 | +VECTOR_FILE="data/ivectors.txt" # To specify if there's only one | |
6 | +VECTOR_FILES_ONE=true # Specify there's only one file | |
7 | + | |
8 | +METAS_CHARACTER="data/masseffect.lst" | |
9 | +CHAR_INFO="data/masseffect_character_information.csv" | |
10 | + | |
11 | +ORIGINAL_VECTOR_FILE="${VECTOR_FILE}" | |
12 | + | |
13 | +KMIN=2 | |
14 | +KMAX=100 |
config/config_iv_skyrim.sh
1 | +OUTDIR="exp/kmeans_euclidian_skyrim/iv" | |
2 | +DATADIR="data" | |
3 | +NEW_LSTDIR="${OUTDIR}/lst" | |
4 | + | |
5 | +VECTOR_FILE="../data/skyrim/skyrim_ivectors.txt" # To specify if there's only one | |
6 | +VECTOR_FILES_ONE=true # Specify there's only one file | |
7 | + | |
8 | +METAS_CHARACTER="../data/skyrim/skyrim.lst" | |
9 | +CHAR_INFO="data/skyrim_character_information.csv" | |
10 | + | |
11 | +ORIGINAL_VECTOR_FILE="${VECTOR_FILE}" | |
12 | + | |
13 | +KMIN=2 | |
14 | +KMAX=100 |
config/config_pv_from_iv.sh
1 | + | |
2 | +if [ -z "$kfold" ] | |
3 | +then | |
4 | + kfold=1 | |
5 | +fi | |
6 | + | |
7 | +if [ -z "${t}" ] | |
8 | +then | |
9 | + t=2.0 | |
10 | +fi | |
11 | + | |
12 | +OUTDIR="exp/kmeans_euclidian/pv_from_iv/${kfold}" | |
13 | +DATADIR="data" | |
14 | +MOTHER_LST_DIR="/local_disk/pegasus/laboinfo/mquillot/vocal_similarity_system/data/prot_alpha" | |
15 | +NEW_LSTDIR="${OUTDIR}/lst" | |
16 | + | |
17 | + | |
18 | +VECTOR_FILE="/local_disk/pegasus/laboinfo/mquillot/knowledge_distillation/exp/kd_iv/${kfold}/${t}/teacher/masseffect_pvectors.txt" # To specify if there's only one | |
19 | +VECTOR_FILES_ONE=true # Specify there's only one file | |
20 | +ORIGINAL_VECTOR_FILE="/local_disk/pegasus/laboinfo/mquillot/knowledge_distillation/data/masseffect.txt" | |
21 | + | |
22 | + | |
23 | +MIN_KFOLD=${kfold} | |
24 | +MAX_KFOLD=${kfold} | |
25 | + | |
26 | +KMIN=2 | |
27 | +KMAX=100 |
config/config_pv_from_xv.sh
1 | + | |
2 | +if [ -z "$kfold" ] | |
3 | +then | |
4 | + kfold=1 | |
5 | +fi | |
6 | + | |
7 | +if [ -z "${t}" ] | |
8 | +then | |
9 | + t=2.0 | |
10 | +fi | |
11 | + | |
12 | +OUTDIR="exp/kmeans_euclidian/pv_from_xv/${kfold}" | |
13 | +DATADIR="data" | |
14 | +MOTHER_LST_DIR="/local_disk/pegasus/laboinfo/mquillot/vocal_similarity_system/data/prot_alpha" | |
15 | +NEW_LSTDIR="${OUTDIR}/lst" | |
16 | + | |
17 | + | |
18 | +VECTOR_FILE="/local_disk/pegasus/laboinfo/mquillot/knowledge_distillation/exp/kd_xvectors/${kfold}/${t}/teacher/masseffect_pvectors.txt" # To specify if there's only one | |
19 | +VECTOR_FILES_ONE=true # Specify there's only one file | |
20 | +ORIGINAL_VECTOR_FILE="/local_disk/pegasus/laboinfo/mquillot/knowledge_distillation/data/masseffect_xvectors.txt" | |
21 | + | |
22 | +MIN_KFOLD=${kfold} | |
23 | +MAX_KFOLD=${kfold} | |
24 | + | |
25 | +KMIN=2 | |
26 | +KMAX=100 |
config/config_without_kfold_iv.sh
1 | +OUTDIR="exp/kmeans_euclidian_skyrim/ivectors" | |
2 | +DATADIR="data" | |
3 | +NEW_LSTDIR="${OUTDIR}/lst" | |
4 | + | |
5 | +LST_FILE="/local_disk/pegasus/laboinfo/mquillot/data/skyrim/skyrim_ivectors.txt" | |
6 | +VECTOR_FILE="data/ivectors.txt" # To specify if there's only one | |
7 | +VECTOR_FILES_ONE=true # Specify there's only one file | |
8 | + | |
9 | +WITHOUT_KFOLD="" | |
10 | +KMIN=2 | |
11 | +KMAX=100 | |
12 | + | |
13 | +METAS_CHARACTER="" |
config/config_xv.sh
config/ivector_config.sh
config/pv_from_xv_config.sh
1 | - | |
2 | -# Framework configuration | |
3 | -OUTDIR="exp/kmeans_euclidian/pv_from_xv" | |
4 | -DATADIR="data" | |
5 | -NEW_LSTDIR="${OUTDIR}/lst" | |
6 | - | |
7 | -VECTOR_FILES_BEGIN="${DATADIR}/pv_from_xv/me_pv_teacher" | |
8 | -VECTOR_FILES_END=".txt" | |
9 | -VECTOR_FILE="" # To specify if there's only one | |
10 | -VECTOR_FILES_ONE=false # Specify there's only one file | |
11 | - | |
12 | -KMIN=2 | |
13 | -KMAX=100 |
config/pvector_config.sh
1 | - | |
2 | -OUTDIR="exp/kmeans_euclidian/teacher-pvector-1" | |
3 | -DATADIR="data" | |
4 | -NEW_LSTDIR="${OUTDIR}/lst" | |
5 | - | |
6 | -VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher" | |
7 | -VECTOR_FILES_END=".txt" | |
8 | -VECTOR_FILE="" # To specify if there's only one | |
9 | -VECTOR_FILES_ONE=false # Specify there's only one file | |
10 | - | |
11 | -KMIN=2 | |
12 | -KMAX=100 |
config/pvector_layer1_config.sh
1 | -OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer1" | |
2 | -DATADIR="data" | |
3 | -NEW_LSTDIR="${OUTDIR}/lst" | |
4 | - | |
5 | -VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_1" | |
6 | -VECTOR_FILES_END=".txt" | |
7 | -VECTOR_FILE="" # To specify if there's only one | |
8 | -VECTOR_FILES_ONE=false # Specify there's only one file | |
9 | - | |
10 | -KMIN=2 | |
11 | -KMAX=100 |
config/pvector_layer2_config.sh
1 | -OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer2" | |
2 | -DATADIR="data" | |
3 | -NEW_LSTDIR="${OUTDIR}/lst" | |
4 | - | |
5 | -VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_2" | |
6 | -VECTOR_FILES_END=".txt" | |
7 | -VECTOR_FILE="" # To specify if there's only one | |
8 | -VECTOR_FILES_ONE=false # Specify there's only one file | |
9 | - | |
10 | -KMIN=2 | |
11 | -KMAX=100 |
config/pvector_layer3_config.sh
1 | -OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer3" | |
2 | -DATADIR="data" | |
3 | -NEW_LSTDIR="${OUTDIR}/lst" | |
4 | - | |
5 | -VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_3" | |
6 | -VECTOR_FILES_END=".txt" | |
7 | -VECTOR_FILE="" # To specify if there's only one | |
8 | -VECTOR_FILES_ONE=false # Specify there's only one file | |
9 | - | |
10 | -KMIN=2 | |
11 | -KMAX=100 |
config/pvector_layer4_config.sh
1 | -OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer4" | |
2 | -DATADIR="data" | |
3 | -NEW_LSTDIR="${OUTDIR}/lst" | |
4 | - | |
5 | -VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_4" | |
6 | -VECTOR_FILES_END=".txt" | |
7 | -VECTOR_FILE="" # To specify if there's only one | |
8 | -VECTOR_FILES_ONE=false # Specify there's only one file | |
9 | - | |
10 | -KMIN=2 | |
11 | -KMAX=100 |
config/xvector_config.sh
extract-labels-pv-from-xv.sh
1 | - | |
2 | - | |
3 | -# Number of set | |
4 | -k=4 | |
5 | - | |
6 | - | |
7 | -# Vector features file | |
8 | -DATADIR="data" | |
9 | - | |
10 | -VECTOR_FILE_MASSEFFECT="${DATADIR}/xvectors.txt" | |
11 | - | |
12 | -for kmean in 12 41 45 50 6 69 72 88 | |
13 | -do | |
14 | - echo "KMEAN: ${kmean}" | |
15 | - # Dirs | |
16 | - EXP_DIR="exp/kmeans_euclidian/pv_from_xv/${k}/${kmean}" | |
17 | - CLUSTERING="${EXP_DIR}/clustering_${kmean}.pkl" | |
18 | - | |
19 | - | |
20 | - # Output dirs | |
21 | - OUTFILE_MASSEFFECT="data/pv_from_xv/saved_clustered/masseffect_clustered_${k}_${kmean}.txt" | |
22 | - echo "Extracting" | |
23 | - python3 bin/extract_kmeans.py "${CLUSTERING}" \ | |
24 | - "${VECTOR_FILE_MASSEFFECT}" \ | |
25 | - --outfile "$OUTFILE_MASSEFFECT" | |
26 | - echo "End extracting" | |
27 | -done |
extract-labels.sh
1 | - | |
2 | - | |
3 | -# Number of set | |
4 | -k=4 | |
5 | -kmean=88 | |
6 | - | |
7 | - | |
8 | -# Vector features file | |
9 | -VECTOR_FILE_MASSEFFECT="data/xvectors.txt" | |
10 | - | |
11 | - | |
12 | -# Dirs | |
13 | -EXP_DIR="exp/kmeans_euclidian/xvectors/${k}/${kmean}" | |
14 | -CLUSTERING="${EXP_DIR}/clustering_${kmean}.pkl" | |
15 | - | |
16 | - | |
17 | -# Output dirs | |
18 | -OUTFILE_MASSEFFECT="data/xvectors/saved_clustered/masseffect_clustered_xvectors_${k}_${kmean}.txt" | |
19 | - | |
20 | -python3 bin/extract_kmeans.py "${CLUSTERING}" \ | |
21 | - "${VECTOR_FILE_MASSEFFECT}" \ | |
22 | - --outfile "$OUTFILE_MASSEFFECT" |
rm-unused-files.sh
run-clustering.sh
run-measures.sh
... | ... | @@ -2,7 +2,9 @@ |
2 | 2 | # quelques petites commandes que l'on souhaite |
3 | 3 | # tester. |
4 | 4 | |
5 | -OUTDIR="exp/kmeans_euclidian/teacher-pvector-1" | |
5 | +set -e | |
6 | + | |
7 | +OUTDIR="exp/kmeans_euclidian/ivectors" | |
6 | 8 | EXP_DIR=${OUTDIR} |
7 | 9 | DATADIR="data" |
8 | 10 | NEW_LSTDIR="${OUTDIR}/lst" |
... | ... | @@ -22,8 +24,8 @@ |
22 | 24 | |
23 | 25 | for kfold in {1..4} |
24 | 26 | do |
25 | - pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt" | |
26 | - VECTOR_FILE=$pvector_file | |
27 | + #pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt" | |
28 | + VECTOR_FILE="${DATADIR}/ivectors.txt" | |
27 | 29 | lst_dir="${DATADIR}/pvectors_1rst/lst" |
28 | 30 | output_kfold="${OUTDIR}/${kfold}" |
29 | 31 | |
30 | 32 | |
31 | 33 | |
32 | 34 | |
... | ... | @@ -61,58 +63,19 @@ |
61 | 63 | do |
62 | 64 | echo "Kmeans Measuring and ploting - ${k}" |
63 | 65 | |
64 | - SUB_EXP_DIR="${EXP_DIR}/${kfold}/${k}" | |
66 | + SUB_EXP_DIR="${EXP_DIR}/${kfold}/${k}" | |
65 | 67 | |
66 | - # -- EXTRACT CLUSTERING LABELS | |
67 | - python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \ | |
68 | - "${VECTOR_FILE}" \ | |
69 | - --outfile "${SUB_EXP_DIR}/clustered_${k}.txt" | |
68 | + # -- EXTRACT CLUSTERING LABELS | |
69 | + python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \ | |
70 | + "${VECTOR_FILE}" \ | |
71 | + --outfile "${SUB_EXP_DIR}/clustered_${k}.txt" | |
70 | 72 | |
71 | - # -- MEASURES AND PLOT WITH RESPECT TO LANG VAR | |
72 | - # Measures | |
73 | - python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | |
74 | - "${METAS_LANG}" \ | |
75 | - "${TRAIN_LST}" \ | |
76 | - "${VAL_LST}" \ | |
77 | - --outfile "${SUB_EXP_DIR}/measures_lang.json" | |
73 | + # -- MEASURES AND PLOT | |
74 | + source steps/measure_clustering_char.sh | |
75 | + source steps/measure_clustering_type.sh | |
76 | + source steps/measure_clustering_lang.sh | |
78 | 77 | |
79 | - # This script plot the count matrix of the train set | |
80 | - python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | |
81 | - "${METAS_LANG}" \ | |
82 | - "${TRAIN_LST}" \ | |
83 | - --outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf" | |
84 | - | |
85 | - # This script plot the count matrix of the validation set | |
86 | - python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | |
87 | - "${METAS_LANG}" \ | |
88 | - "${VAL_LST}" \ | |
89 | - --outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf" | |
90 | - | |
91 | - rm ${SUB_EXP_DIR}/clustered_${k}.txt | |
92 | - #python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" \ | |
93 | - # "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" \ | |
94 | - # "${lst_dir}/val_${kfold}.lst" \ | |
95 | - # --outfile "${output_kfold}/${k}/measures_type.json" | |
96 | - | |
97 | - # This script plot the count matrix of the train set | |
98 | - #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | |
99 | - # ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \ | |
100 | - # --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf | |
101 | - | |
102 | - # This script plot the count matrix of the validation set | |
103 | - #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | |
104 | - # ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \ | |
105 | - # --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf | |
106 | - | |
107 | - # This script plot the count matrix of the train set | |
108 | - #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | |
109 | - # ${pvector_file} ${lst_dir}/train_${kfold}.lst \ | |
110 | - # --outfile ${output_kfold}/${k}/train_count_matrix.pdf | |
111 | - | |
112 | - # This script plot the count matrix of the validation set | |
113 | - #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | |
114 | - # ${pvector_file} ${lst_dir}/val_${kfold}.lst \ | |
115 | - # --outfile ${output_kfold}/${k}/val_count_matrix.pdf | |
78 | + rm ${SUB_EXP_DIR}/clustered_${k}.txt | |
116 | 79 | done |
117 | 80 | done |
run-skyrim.sh
1 | +python bin/cluster_kmeans.py ../data/skyrim/skyrim_ivectors.txt ../data/skyrim/skyrim.lst exp/kmeans_euclidian_skyrim/ivectors/ --kmin 1 --kmax 100 |
run.sh
... | ... | @@ -31,7 +31,17 @@ |
31 | 31 | fi |
32 | 32 | |
33 | 33 | |
34 | +if [ -z "$METAS_CHARACTER" ] | |
35 | +then | |
36 | + METAS_CHARACTER="${DATADIR}/masseffect.lst" | |
37 | +fi | |
34 | 38 | |
39 | + | |
40 | +if [ -z "$CHAR_INFO" ] | |
41 | +then | |
42 | + CHAR_INFO="${DATADIR}/character_information.csv" | |
43 | +fi | |
44 | + | |
35 | 45 | # -- MAKE DIRECTORIES |
36 | 46 | if [ ! -d "$OUTDIR" ]; |
37 | 47 | then |
... | ... | @@ -59,7 +69,6 @@ |
59 | 69 | for kfold in $(seq ${MIN_KFOLD} ${MAX_KFOLD}) |
60 | 70 | do |
61 | 71 | # Some usefull variable |
62 | - CHAR_INFO="${DATADIR}/character_information.csv" | |
63 | 72 | TRAIN_TYPE_LST="${NEW_LSTDIR}/train_${kfold}_type.lst" |
64 | 73 | VAL_TYPE_LST="${NEW_LSTDIR}/val_${kfold}_type.lst" |
65 | 74 | TRAIN_LANG_LST="${NEW_LSTDIR}/train_${kfold}_lang.lst" |
66 | 75 | |
... | ... | @@ -71,11 +80,10 @@ |
71 | 80 | VECTOR_FILE="${VECTOR_FILES_BEGIN}_${kfold}${VECTOR_FILES_END}" |
72 | 81 | fi |
73 | 82 | |
74 | - TRAIN_LST="${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst" | |
75 | - VAL_LST="${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst" | |
83 | + TRAIN_LST="${MOTHER_LST_DIR}/lst/train_${kfold}.lst" | |
84 | + VAL_LST="${MOTHER_LST_DIR}/lst/val_${kfold}.lst" | |
76 | 85 | EXP_DIR="${OUTDIR}/${kfold}" |
77 | 86 | METAS_TYPE="${NEW_LSTDIR}/metas_${kfold}_type.lst" |
78 | - METAS_CHARACTER="${DATADIR}/masseffect.lst" | |
79 | 87 | METAS_LANG="${NEW_LSTDIR}/metas_${kfold}_lang.lst" |
80 | 88 | |
81 | 89 |
run_kfold.sh
run_without_kfold.sh
1 | + | |
2 | +for k in $(seq ${KMIN} 1 ${KMAX}) | |
3 | +do | |
4 | + SUB_EXP_DIR="${EXP_DIR}/${k}" | |
5 | + | |
6 | + # -- EXTRACT KMEANS VALUES | |
7 | + echo "Kmeans Measuring and extraction - ${k}" | |
8 | + python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \ | |
9 | + "${VECTOR_FILE}" \ | |
10 | + --outfile "${SUB_EXP_DIR}/clustered_${k}.txt" | |
11 | + | |
12 | + python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | |
13 | + "${METAS_CHARACTER}" \ | |
14 | + "${TRAIN_LST}" \ | |
15 | + "${VAL_LST}" \ | |
16 | + --outfile "${SUB_EXP_DIR}/measures.json" |
steps/extract_cluster_file.sh
1 | + | |
2 | +for kfold in `seq 1 4` | |
3 | +do | |
4 | + source $1 | |
5 | + vector_file=${VECTOR_FILE} | |
6 | + echo "kfold: $kfold" | |
7 | + for kmean in `seq 2 100` | |
8 | + do | |
9 | + echo "kmean: $kmean" | |
10 | + exp_dir="${OUTDIR}/${kfold}/${kmean}" | |
11 | + clustering="${exp_dir}/clustering_${kmean}.pkl" | |
12 | + save_loc="${exp_dir}" | |
13 | + saved_txt="${save_loc}/masseffect_clustered.txt" | |
14 | + saved_lst="${save_loc}/masseffect_clustered.lst" | |
15 | + | |
16 | + python3 bin/extract_kmeans.py "${clustering}" \ | |
17 | + "${vector_file}" \ | |
18 | + --outfile "${saved_txt}" | |
19 | + | |
20 | + cat ${saved_txt} | cut -d" " -f1 > ${saved_lst} | |
21 | + | |
22 | + python3 bin/replace-features.py "${ORIGINAL_VECTOR_FILE}" "${saved_txt}" | |
23 | + done | |
24 | +done |
steps/extract_cluster_file_skyrim.sh
1 | + | |
2 | +source $1 | |
3 | +vector_file=${VECTOR_FILE} | |
4 | +echo "kfold: $kfold" | |
5 | +for kmean in `seq 2 100` | |
6 | +do | |
7 | + echo "kmean: $kmean" | |
8 | + exp_dir="${OUTDIR}/${kmean}" | |
9 | + clustering="${exp_dir}/clustering_${kmean}.pkl" | |
10 | + save_loc="${exp_dir}" | |
11 | + saved_txt="${save_loc}/masseffect_clustered.txt" | |
12 | + saved_lst="${save_loc}/masseffect_clustered.lst" | |
13 | + | |
14 | + python3 bin/extract_kmeans.py "${clustering}" \ | |
15 | + "${vector_file}" \ | |
16 | + --outfile "${saved_txt}" | |
17 | + | |
18 | + cat ${saved_txt} | cut -d" " -f1 > ${saved_lst} | |
19 | + | |
20 | + python3 bin/replace-features.py "${ORIGINAL_VECTOR_FILE}" "${saved_txt}" | |
21 | +done |
steps/extract_language_lst.sh
1 | +DATADIR="data" | |
2 | +OUTDIR="exp/kmeans_euclidian/ivectors" | |
3 | +NEW_LSTDIR="${OUTDIR}/lst" | |
4 | + | |
5 | +TRAIN_LST=${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst | |
6 | +VAL_LST=${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst | |
7 | +TRAIN_LANG_LST=${NEW_LSTDIR}/train_${kfold}_lang.lst | |
8 | +VAL_LANG_LST=${NEW_LSTDIR}/val_${kfold}_lang.lst | |
9 | +METAS_LANG=${NEW_LSTDIR}/metas_${kfold}_lang.lst | |
10 | + | |
11 | + | |
12 | +awk '$2=$1' FS=, OFS=, ${TRAIN_LST} > ${TRAIN_LANG_LST} | |
13 | +echo "VAL EXTRACT LANGUAGE INFO DONE" | |
14 | +awk '$2=$1' FS=, OFS=, ${VAL_LST} > ${VAL_LANG_LST} | |
15 | +echo "TRAIN EXTRACT LANGUAGE INFO DONE" | |
16 | +cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}" | |
17 | +echo "GLOBAL EXTRACT LANGUAGE INFO DONE" |
steps/measure_clustering_char.sh
1 | + | |
2 | +python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" \ | |
3 | + "${lst_dir}/trainval_${kfold}.lst" "${lst_dir}/train_${kfold}.lst" \ | |
4 | + "${lst_dir}/val_${kfold}.lst" \ | |
5 | + --outfile "${output_kfold}/${k}/measures.json" | |
6 | + | |
7 | + | |
8 | +# This script plot the count matrix of the train set | |
9 | +python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | |
10 | + "${lst_dir}/train_${kfold}.lst" \ | |
11 | + "${lst_dir}/train_${kfold}.lst" \ | |
12 | + --outfile "${SUB_EXP_DIR}/train_count_matrix.pdf" | |
13 | + | |
14 | +# This script plot the count matrix of the validation set | |
15 | +python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | |
16 | + "${lst_dir}/val_${kfold}.lst" \ | |
17 | + "${lst_dir}/val_${kfold}.lst" \ | |
18 | + --outfile "${SUB_EXP_DIR}/val_count_matrix.pdf" |
steps/measure_clustering_lang.sh
1 | + | |
2 | +python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | |
3 | + "${METAS_LANG}" \ | |
4 | + "${TRAIN_LST}" \ | |
5 | + "${VAL_LST}" \ | |
6 | + --outfile "${SUB_EXP_DIR}/measures_lang.json" | |
7 | + | |
8 | +# This script plot the count matrix of the train set | |
9 | +python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | |
10 | + "${METAS_LANG}" \ | |
11 | + "${TRAIN_LST}" \ | |
12 | + --outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf" | |
13 | + | |
14 | +# This script plot the count matrix of the validation set | |
15 | +python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | |
16 | + "${METAS_LANG}" \ | |
17 | + "${VAL_LST}" \ | |
18 | + --outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf" |
steps/measure_clustering_type.sh
1 | +python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" \ | |
2 | + "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" \ | |
3 | + "${lst_dir}/val_${kfold}.lst" \ | |
4 | + --outfile "${output_kfold}/${k}/measures_type.json" | |
5 | + | |
6 | +# This script plot the count matrix of the train set | |
7 | +python3 bin/plot-count-matrix.py "${output_kfold}/${k}/clustered_${k}.txt" \ | |
8 | + "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" \ | |
9 | + --outfile "${output_kfold}/${k}/train_count_matrix_type.pdf" | |
10 | + | |
11 | +# This script plot the count matrix of the validation set | |
12 | +python3 bin/plot-count-matrix.py "${output_kfold}/${k}/clustered_${k}.txt" \ | |
13 | + "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/val_${kfold}.lst" \ | |
14 | + --outfile "${output_kfold}/${k}/val_count_matrix_type.pdf" | |
15 | + |
steps/save_clusters_file.sh
1 | + | |
2 | +vector_file="data/xvectors.txt" | |
3 | + | |
4 | +for kfold in `seq 1 4` | |
5 | +do | |
6 | + echo "kfold: $kfold" | |
7 | + for kmean in `seq 2 100` | |
8 | + do | |
9 | + echo "kmean: $kmean" | |
10 | + exp_dir="exp/kmeans_euclidian/xvectors/${kfold}/${kmean}" | |
11 | + clustering="${exp_dir}/clustering_${kmean}.pkl" | |
12 | + save_loc="data/xvectors/saved_clustered/" | |
13 | + saved_txt="${save_loc}/masseffect_clustered_xvectors_${kfold}_${kmean}.txt" | |
14 | + saved_lst="${save_loc}/masseffect_clustered_xvectors_${kfold}_${kmean}.lst" | |
15 | + | |
16 | + python3 bin/extract_kmeans.py "${clustering}" \ | |
17 | + "${vector_file}" \ | |
18 | + --outfile "${saved_txt}" | |
19 | + | |
20 | + cat ${saved_txt} | cut -d" " -f1 > ${saved_lst} | |
21 | + done | |
22 | +done |
utils/extract-labels.sh
1 | + | |
2 | + | |
3 | +# Number of set | |
4 | +k=4 | |
5 | +kmean=88 | |
6 | + | |
7 | + | |
8 | +# Vector features file | |
9 | +VECTOR_FILE_MASSEFFECT="data/xvectors.txt" | |
10 | + | |
11 | + | |
12 | +# Dirs | |
13 | +EXP_DIR="exp/kmeans_euclidian/xvectors/${k}/${kmean}" | |
14 | +CLUSTERING="${EXP_DIR}/clustering_${kmean}.pkl" | |
15 | + | |
16 | + | |
17 | +# Output dirs | |
18 | +OUTFILE_MASSEFFECT="data/xvectors/saved_clustered/masseffect_clustered_xvectors_${k}_${kmean}.txt" | |
19 | + | |
20 | +python3 bin/extract_kmeans.py "${CLUSTERING}" \ | |
21 | + "${VECTOR_FILE_MASSEFFECT}" \ | |
22 | + --outfile "$OUTFILE_MASSEFFECT" |
utils/rm-unused-files.sh
utils/transform_exp_to_kd.sh
1 | + | |
2 | +# -- DESCRIPTION -- | |
3 | +# | |
4 | +# This script aims to transform data in a shape that is | |
5 | +# usable mainly by knowledge distillation scripts. | |
6 | +# | |
7 | +# Firstly, it extracts clustering labels | |
8 | +# then change features with the given one | |
9 | +# and finally generate a list file. | |
10 | +# | |
11 | +# The pair features files and list file will be usable | |
12 | +# by the knowledge distillation system. | |
13 | +# -------------------- | |
14 | + | |
15 | + | |
16 | +# -- CONFIGURATION -- | |
17 | +# Configuration error | |
18 | +set -e | |
19 | + | |
20 | +# KFOLD config | |
21 | +MIN_KFOLD=1 | |
22 | +MAX_KFOLD=4 | |
23 | + | |
24 | +# KMEAN config | |
25 | +MIN_KMEAN=2 | |
26 | +MAX_KMEAN=100 | |
27 | + | |
28 | +# Vector features file | |
29 | +DATADIR="data" | |
30 | +FEATURES_DIR="${DATADIR}/pv_from_xv" | |
31 | +FEATURES_PREFIX="me_pv_teacher" | |
32 | +FEATURES_SUFFIX=".txt" | |
33 | + | |
34 | +EXP_DIR="exp/kmeans_euclidian/pv_from_xv" | |
35 | +VECTOR_FILE_MASSEFFECT="${DATADIR}/xvectors.txt" | |
36 | +OUTDIR="data/pv_from_xv/saved_clustered" | |
37 | + | |
38 | +# -- CREATE DIRECTORIES | |
39 | +# OUTPUT DIRECTORY | |
40 | +if [ ! -d "${OUTDIR}" ] | |
41 | +then | |
42 | + mkdir -p ${OUTDIR} | |
43 | +fi | |
44 | + | |
45 | + | |
46 | +# -- FUNCTIONS -- | |
47 | +# Definition of the transform function | |
48 | +function transform() { | |
49 | + # Define subdir variable | |
50 | + local SUB_EXP_DIR="${EXP_DIR}/${k}/${kmean}" | |
51 | + | |
52 | + # Define features file variable | |
53 | + local INITIAL_VECTOR_FILE="${FEATURES_DIR}/${FEATURES_PREFIX}_${k}${FEATURES_SUFFIX}" | |
54 | + | |
55 | + # Information of the current process | |
56 | + echo "[KFOLD, KMEAN]: [${k}, ${kmean}]" | |
57 | + | |
58 | + # Define clustering model variable | |
59 | + local CLUSTERING="${SUB_EXP_DIR}/clustering_${kmean}.pkl" | |
60 | + | |
61 | + | |
62 | + # Define output file | |
63 | + local OUTFILE_MASSEFFECT="${OUTDIR}/masseffect_clustered_${k}_${kmean}.txt" | |
64 | + | |
65 | + # Extracting clustering labels | |
66 | + echo "Extracting clustering labels" | |
67 | + python3 bin/extract_kmeans.py "${CLUSTERING}" \ | |
68 | + "${INITIAL_VECTOR_FILE}" \ | |
69 | + --outfile "${OUTFILE_MASSEFFECT}" | |
70 | + | |
71 | + # Changing features | |
72 | + echo "Changing features" | |
73 | + python bin/replace-features.py ${VECTOR_FILE_MASSEFFECT} ${OUTFILE_MASSEFFECT} | |
74 | + | |
75 | + # Extracting list file | |
76 | + cut -d' ' -f1 ${OUTFILE_MASSEFFECT} > "${OUTDIR}/masseffect_clustered_${k}_${kmean}.lst" | |
77 | +} | |
78 | + | |
79 | + | |
80 | +# -- MAIN LOOPS | |
81 | +for k in $(seq ${MIN_KFOLD} ${MAX_KFOLD}) | |
82 | +do | |
83 | + for kmean in $(seq ${MIN_KMEAN} ${MAX_KMEAN}) | |
84 | + do | |
85 | + transform | |
86 | + done | |
87 | +done |