Commit e63ab06fc786597258d861e68c335de9e2afceb4
1 parent
c95c2bf75c
Exists in
master
New organisation of the project
Showing 44 changed files with 544 additions and 210 deletions Inline Diff
- README.md
- bin/regroup-measures.py
- bin/replace-features.py
- config/archives/ivector_config.sh
- config/archives/pv_from_xv_config.sh
- config/archives/pvector_config.sh
- config/archives/pvector_layer1_config.sh
- config/archives/pvector_layer2_config.sh
- config/archives/pvector_layer3_config.sh
- config/archives/pvector_layer4_config.sh
- config/archives/xvector_config.sh
- config/config_iv.sh
- config/config_iv_skyrim.sh
- config/config_pv_from_iv.sh
- config/config_pv_from_xv.sh
- config/config_without_kfold_iv.sh
- config/config_xv.sh
- config/ivector_config.sh
- config/pv_from_xv_config.sh
- config/pvector_config.sh
- config/pvector_layer1_config.sh
- config/pvector_layer2_config.sh
- config/pvector_layer3_config.sh
- config/pvector_layer4_config.sh
- config/xvector_config.sh
- extract-labels-pv-from-xv.sh
- extract-labels.sh
- rm-unused-files.sh
- run-clustering.sh
- run-measures.sh
- run-skyrim.sh
- run.sh
- run_kfold.sh
- run_without_kfold.sh
- steps/extract_cluster_file.sh
- steps/extract_cluster_file_skyrim.sh
- steps/extract_language_lst.sh
- steps/measure_clustering_char.sh
- steps/measure_clustering_lang.sh
- steps/measure_clustering_type.sh
- steps/save_clusters_file.sh
- utils/extract-labels.sh
- utils/rm-unused-files.sh
- utils/transform_exp_to_kd.sh
README.md
1 | # Clustering | 1 | # Clustering |
2 | A repository where I put everything dealing with clustering algorithms. | 2 | A repository where I put everything dealing with clustering algorithms. |
3 | 3 | ||
4 | # How to use | ||
5 | You can run the run.sh script directly if you want. You just need data. | ||
6 | |||
7 | You can also use the scripts in the utils directory, but run them from the root directory "clustering/". | ||
8 | |||
4 | # TODO | 9 | # TODO |
5 | - Organiser les différentes listes de données pour mes expériences | 10 | - Organiser les différentes listes de données pour mes expériences |
6 | - Create a data file example | 11 | - Create a data file example |
7 | 12 | ||
8 | # Data | 13 | # Data |
9 | 14 | ||
10 | # File format | 15 | # File format |
11 | 16 |
bin/regroup-measures.py
1 | ''' | 1 | ''' |
2 | Regroup results into one file and a plot. | 2 | Regroup results into one file and a plot. |
3 | TODO: Mettre en valeur les valeurs maximales | 3 | TODO: Mettre en valeur les valeurs maximales |
4 | TODO: Sauvegarder les valeurs quelques part pour qu'on puisse facilement les retrouver. | 4 | TODO: Sauvegarder les valeurs quelques part pour qu'on puisse facilement les retrouver. |
5 | 5 | ||
6 | ''' | 6 | ''' |
7 | 7 | ||
8 | import numpy as np | 8 | import numpy as np |
9 | import matplotlib.pyplot as plt | 9 | import matplotlib.pyplot as plt |
10 | import argparse | 10 | import argparse |
11 | import os | 11 | import os |
12 | import json | 12 | import json |
13 | 13 | ||
14 | 14 | ||
15 | def plot_values_clusters(values, title, xlabel, ylabel): | 15 | def plot_values_clusters(values, title, xlabel, ylabel): |
16 | values = np.asarray(values) | 16 | values = np.asarray(values) |
17 | x = np.arange(len(values)) + 2 | 17 | x = np.arange(len(values)) + 2 |
18 | x_ticks = np.arange(len(values), step=10) + 2 | 18 | x_ticks = np.arange(len(values), step=10) + 2 |
19 | y = values | 19 | y = values |
20 | plt.scatter(x, y, s=1) | 20 | plt.scatter(x, y, s=1) |
21 | plt.xticks(x_ticks) | 21 | plt.xticks(x_ticks) |
22 | plt.title(title) | 22 | plt.title(title) |
23 | plt.xlabel(xlabel) | 23 | plt.xlabel(xlabel) |
24 | plt.ylabel(ylabel) | 24 | plt.ylabel(ylabel) |
25 | 25 | ||
26 | 26 | ||
27 | def save_plot(filepath): | 27 | def save_plot(filepath): |
28 | plt.savefig(filepath) | 28 | plt.savefig(filepath) |
29 | plt.close() | 29 | plt.close() |
30 | 30 | ||
31 | 31 | ||
32 | def save_results(outfile, measures, titles): | 32 | def save_results(outfile, measures, titles): |
33 | with open(outfile, "w") as f: | 33 | with open(outfile, "w") as f: |
34 | f.write(",".join(titles) + "\n") | 34 | f.write(",".join(titles) + "\n") |
35 | n = len(measures[0]) | 35 | n = len(measures[0]) |
36 | for i in range(n): | 36 | for i in range(n): |
37 | f.write(",".join([str(ms[i]) for ms in measures]) + "\n") | 37 | f.write(",".join([str(ms[i]) for ms in measures]) + "\n") |
38 | 38 | ||
39 | 39 | ||
40 | # -- PARSER | 40 | # -- PARSER |
41 | parser = argparse.ArgumentParser(description="") | 41 | parser = argparse.ArgumentParser(description="") |
42 | parser.add_argument("expdir", type=str, help="Directory of experiment") | 42 | parser.add_argument("expdir", type=str, help="Directory of experiment") |
43 | parser.add_argument("--nkfold", type=int, default=4, help="number of kfold") | ||
44 | parser.add_argument("--nkfoldmin", type=int, default=1, help="Begin with this numero of kfold") | ||
43 | parser.add_argument("--measurefile", type=str, default="measures.json", | 45 | parser.add_argument("--measurefile", type=str, default="measures.json", |
44 | help="Measure file it searchs in folders") | 46 | help="Measure file it searchs in folders") |
45 | parser.add_argument("--suffix", type=str, default="", | 47 | parser.add_argument("--suffix", type=str, default="", |
46 | help="suffix of saved files") | 48 | help="suffix of saved files") |
47 | 49 | ||
48 | args = parser.parse_args() | 50 | args = parser.parse_args() |
49 | EXP_DIR = args.expdir | 51 | EXP_DIR = args.expdir |
50 | MEASURE_FILE = args.measurefile | 52 | MEASURE_FILE = args.measurefile |
51 | SUFFIX = args.suffix | 53 | SUFFIX = args.suffix |
54 | MAX_KFOLD = args.nkfold | ||
55 | MIN_KFOLD = args.nkfoldmin | ||
52 | 56 | ||
53 | # EXP_DIR="exp/kmeans_teacher_1/pvector-1" | 57 | # EXP_DIR="exp/kmeans_teacher_1/pvector-1" |
54 | RESULTS_DIR = os.path.join(EXP_DIR, "res") | 58 | RESULTS_DIR = os.path.join(EXP_DIR, "res") |
55 | 59 | ||
56 | # -- CONFIG | 60 | # -- CONFIG |
57 | kmin = 2 | 61 | kmin = 2 |
58 | kmax = 100 | 62 | kmax = 100 |
59 | 63 | ||
60 | 64 | ||
61 | # -- CREATE FOLDER | 65 | # -- CREATE FOLDER |
62 | if not os.path.exists(RESULTS_DIR): | 66 | if not os.path.exists(RESULTS_DIR): |
63 | os.makedirs(RESULTS_DIR) | 67 | os.makedirs(RESULTS_DIR) |
64 | 68 | ||
65 | # -- BEGIN REGROUPMENT | 69 | # -- BEGIN REGROUPMENT |
66 | 70 | ||
67 | subsets = ["train", "val"] | 71 | subsets = ["train", "val"] |
68 | 72 | ||
69 | disequilibriums = [] | 73 | disequilibriums = [] |
70 | 74 | ||
71 | 75 | ||
72 | def init_measures(): | 76 | def init_measures(): |
73 | measures = {} | 77 | measures = {} |
74 | 78 | ||
75 | for subset in subsets: | 79 | for subset in subsets: |
76 | measures[subset] = {} | 80 | measures[subset] = {} |
77 | measures[subset]["entropy"] = [] | 81 | measures[subset]["entropy"] = [] |
78 | measures[subset]["vscore"] = [] | 82 | measures[subset]["vscore"] = [] |
79 | measures[subset]["homogeneity"] = [] | 83 | measures[subset]["homogeneity"] = [] |
80 | measures[subset]["completeness"] = [] | 84 | measures[subset]["completeness"] = [] |
81 | return measures | 85 | return measures |
82 | 86 | ||
83 | 87 | ||
84 | measures = init_measures() | 88 | measures = init_measures() |
85 | 89 | ||
86 | for kfold in range(1, 5): | 90 | for kfold in range(MIN_KFOLD, MAX_KFOLD + 1): |
87 | print("Regrouping on kfold: " + str(kfold)) | 91 | print("Regrouping on kfold: " + str(kfold)) |
88 | # -- REGROUP MEASURES INTO LISTS | 92 | # -- REGROUP MEASURES INTO LISTS |
89 | for k in range(kmin, kmax+1): | 93 | for k in range(kmin, kmax+1): |
90 | measures_file = os.path.join(EXP_DIR, str(kfold), str(k), MEASURE_FILE) | 94 | measures_file = os.path.join(EXP_DIR, str(kfold), str(k), MEASURE_FILE) |
91 | with open(measures_file, 'r') as f: | 95 | with open(measures_file, 'r') as f: |
92 | meas_data = json.load(f) | 96 | meas_data = json.load(f) |
93 | disequilibriums.append(meas_data["disequilibrium"]) | 97 | disequilibriums.append(meas_data["disequilibrium"]) |
94 | for subset in subsets: | 98 | for subset in subsets: |
95 | measures[subset]["entropy"].append( | 99 | measures[subset]["entropy"].append( |
96 | meas_data[subset]["entropy"]) | 100 | meas_data[subset]["entropy"]) |
97 | measures[subset]["vscore"].append( | 101 | measures[subset]["vscore"].append( |
98 | meas_data[subset]["vscore"]) | 102 | meas_data[subset]["vscore"]) |
99 | measures[subset]["homogeneity"].append( | 103 | measures[subset]["homogeneity"].append( |
100 | meas_data[subset]["homogeneity"]) | 104 | meas_data[subset]["homogeneity"]) |
101 | measures[subset]["completeness"].append( | 105 | measures[subset]["completeness"].append( |
102 | meas_data[subset]["completeness"]) | 106 | meas_data[subset]["completeness"]) |
103 | 107 | ||
104 | # -- PLOT AND SAVE MEASURES FOR A SPECIFIC SUBSET | 108 | # -- PLOT AND SAVE MEASURES FOR A SPECIFIC SUBSET |
105 | for subset in subsets: | 109 | for subset in subsets: |
106 | # Plot all measures | 110 | # Plot all measures |
107 | outf = "measures_" + str(subset) + "_" + str(kfold) + str(SUFFIX) + ".pdf" | 111 | outf = "measures_" + str(subset) + "_" + str(kfold) + str(SUFFIX) + ".pdf" |
108 | 112 | ||
109 | fig = plt.figure(1) | 113 | fig = plt.figure(1) |
110 | for i, measure in enumerate(measures[subset]): | 114 | for i, measure in enumerate(measures[subset]): |
111 | 115 | ||
112 | plt.subplot(220 + i + 1) | 116 | plt.subplot(220 + i + 1) |
113 | 117 | ||
114 | plot_values_clusters( | 118 | plot_values_clusters( |
115 | measures[subset][measure], | 119 | measures[subset][measure], |
116 | measure.capitalize() + " " + str(subset) + " set " + str(kfold), | 120 | measure.capitalize() + " " + str(subset) + " set " + str(kfold), |
117 | "N clusters", | 121 | "N clusters", |
118 | measure.capitalize()) | 122 | measure.capitalize()) |
119 | plt.subplots_adjust(hspace=0.5, wspace=0.3) | 123 | plt.subplots_adjust(hspace=0.5, wspace=0.3) |
120 | save_plot(os.path.join(RESULTS_DIR, outf)) | 124 | save_plot(os.path.join(RESULTS_DIR, outf)) |
121 | 125 | ||
122 | # Save all measures on a csv file | 126 | # Save all measures on a csv file |
123 | save_results( | 127 | save_results( |
124 | os.path.join(RESULTS_DIR, "measures_" + str(subset) + "_" + str(kfold) + str(SUFFIX) + ".csv"), | 128 | os.path.join(RESULTS_DIR, "measures_" + str(subset) + "_" + str(kfold) + str(SUFFIX) + ".csv"), |
125 | [ | 129 | [ |
126 | measures[subset]["entropy"], | 130 | measures[subset]["entropy"], |
127 | measures[subset]["homogeneity"], | 131 | measures[subset]["homogeneity"], |
128 | measures[subset]["completeness"], | 132 | measures[subset]["completeness"], |
129 | measures[subset]["vscore"] | 133 | measures[subset]["vscore"] |
130 | ], | 134 | ], |
131 | [ | 135 | [ |
132 | "entropy", | 136 | "entropy", |
133 | "homogeneity", | 137 | "homogeneity", |
134 | "completeness", | 138 | "completeness", |
135 | "vscore" | 139 | "vscore" |
136 | ] | 140 | ] |
137 | ) | 141 | ) |
138 | 142 | ||
139 | # PLOT AND SAVE FOR DISEQUILIBRIUM | 143 | # PLOT AND SAVE FOR DISEQUILIBRIUM |
140 | plot_values_clusters( | 144 | plot_values_clusters( |
141 | disequilibriums, | 145 | disequilibriums, |
142 | "Disequilibrium set " + str(kfold), | 146 | "Disequilibrium set " + str(kfold), |
143 | "N clusters", | 147 | "N clusters", |
144 | "Disequilibrium") | 148 | "Disequilibrium") |
145 | save_plot(os.path.join(RESULTS_DIR, "disequilibrium_" + str(kfold) + str(SUFFIX) + ".pdf")) | 149 | save_plot(os.path.join(RESULTS_DIR, "disequilibrium_" + str(kfold) + str(SUFFIX) + ".pdf")) |
146 | 150 | ||
147 | save_results( | 151 | save_results( |
148 | os.path.join(RESULTS_DIR, "disequilibrium_" + str(kfold) + str(SUFFIX) + ".csv"), | 152 | os.path.join(RESULTS_DIR, "disequilibrium_" + str(kfold) + str(SUFFIX) + ".csv"), |
149 | [disequilibriums], | 153 | [disequilibriums], |
150 | ["disequilibrium"]) | 154 | ["disequilibrium"]) |
151 | 155 | ||
152 | measures = init_measures() | 156 | measures = init_measures() |
153 | disequilibriums = [] | 157 | disequilibriums = [] |
154 | 158 |
bin/replace-features.py
File was created | 1 | ||
2 | import argparse | ||
3 | |||
4 | from data import read_file, index_by_id, write_line | ||
5 | |||
6 | # -- ARGPARSE | ||
7 | parser = argparse.ArgumentParser( | ||
8 | description="Replace features with file from to file to") | ||
9 | parser.add_argument("fromfile", type=str, help="From list or features file") | ||
10 | parser.add_argument("tofile", type=str, help="Features of 'from' saved into this file.") | ||
11 | |||
12 | args = parser.parse_args() | ||
13 | FROM = args.fromfile | ||
14 | TO = args.tofile | ||
15 | |||
16 | |||
17 | # -- READ AND INDEX FILES | ||
18 | from_data = read_file(FROM) | ||
19 | from_by_id = index_by_id(from_data) | ||
20 | |||
21 | to_data = read_file(TO) | ||
22 | |||
23 | with open(TO, "w") as f: | ||
24 | for line in to_data: | ||
25 | metas = line[0] | ||
26 | features = from_by_id[metas[0]][metas[3]][1] | ||
27 | write_line(metas, features, f) | ||
28 | |||
29 |
config/archives/ivector_config.sh
File was created | 1 | OUTDIR="exp/kmeans_euclidian/ivectors" | |
2 | DATADIR="data" | ||
3 | NEW_LSTDIR="${OUTDIR}/lst" | ||
4 | |||
5 | VECTOR_FILE="data/ivectors.txt" # To specify if there's only one | ||
6 | VECTOR_FILES_ONE=true # Specify there's only one file | ||
7 | |||
8 | KMIN=2 | ||
9 | KMAX=100 | ||
10 |
config/archives/pv_from_xv_config.sh
File was created | 1 | ||
2 | # Framework configuration | ||
3 | OUTDIR="exp/kmeans_euclidian/pv_from_xv" | ||
4 | DATADIR="data" | ||
5 | NEW_LSTDIR="${OUTDIR}/lst" | ||
6 | |||
7 | VECTOR_FILES_BEGIN="${DATADIR}/pv_from_xv/me_pv_teacher" | ||
8 | VECTOR_FILES_END=".txt" | ||
9 | VECTOR_FILE="" # To specify if there's only one | ||
10 | VECTOR_FILES_ONE=false # Specify there's only one file | ||
11 | |||
12 | KMIN=2 | ||
13 | KMAX=100 | ||
14 |
config/archives/pvector_config.sh
File was created | 1 | ||
2 | OUTDIR="exp/kmeans_euclidian/teacher-pvector-1" | ||
3 | DATADIR="data" | ||
4 | NEW_LSTDIR="${OUTDIR}/lst" | ||
5 | |||
6 | VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher" | ||
7 | VECTOR_FILES_END=".txt" | ||
8 | VECTOR_FILE="" # To specify if there's only one | ||
9 | VECTOR_FILES_ONE=false # Specify there's only one file | ||
10 | |||
11 | KMIN=2 | ||
12 | KMAX=100 | ||
13 |
config/archives/pvector_layer1_config.sh
File was created | 1 | OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer1" | |
2 | DATADIR="data" | ||
3 | NEW_LSTDIR="${OUTDIR}/lst" | ||
4 | |||
5 | VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_1" | ||
6 | VECTOR_FILES_END=".txt" | ||
7 | VECTOR_FILE="" # To specify if there's only one | ||
8 | VECTOR_FILES_ONE=false # Specify there's only one file | ||
9 | |||
10 | KMIN=2 | ||
11 | KMAX=100 | ||
12 |
config/archives/pvector_layer2_config.sh
File was created | 1 | OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer2" | |
2 | DATADIR="data" | ||
3 | NEW_LSTDIR="${OUTDIR}/lst" | ||
4 | |||
5 | VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_2" | ||
6 | VECTOR_FILES_END=".txt" | ||
7 | VECTOR_FILE="" # To specify if there's only one | ||
8 | VECTOR_FILES_ONE=false # Specify there's only one file | ||
9 | |||
10 | KMIN=2 | ||
11 | KMAX=100 | ||
12 |
config/archives/pvector_layer3_config.sh
File was created | 1 | OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer3" | |
2 | DATADIR="data" | ||
3 | NEW_LSTDIR="${OUTDIR}/lst" | ||
4 | |||
5 | VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_3" | ||
6 | VECTOR_FILES_END=".txt" | ||
7 | VECTOR_FILE="" # To specify if there's only one | ||
8 | VECTOR_FILES_ONE=false # Specify there's only one file | ||
9 | |||
10 | KMIN=2 | ||
11 | KMAX=100 | ||
12 |
config/archives/pvector_layer4_config.sh
File was created | 1 | OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer4" | |
2 | DATADIR="data" | ||
3 | NEW_LSTDIR="${OUTDIR}/lst" | ||
4 | |||
5 | VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_4" | ||
6 | VECTOR_FILES_END=".txt" | ||
7 | VECTOR_FILE="" # To specify if there's only one | ||
8 | VECTOR_FILES_ONE=false # Specify there's only one file | ||
9 | |||
10 | KMIN=2 | ||
11 | KMAX=100 | ||
12 |
config/archives/xvector_config.sh
File was created | 1 | OUTDIR="exp/kmeans_euclidian/xvectors" | |
2 | DATADIR="data" | ||
3 | NEW_LSTDIR="${OUTDIR}/lst" | ||
4 | |||
5 | VECTOR_FILE="data/xvectors.txt" # To specify if there's only one | ||
6 | VECTOR_FILES_ONE=true # Specify there's only one file | ||
7 | |||
8 | KMIN=2 | ||
9 | KMAX=100 | ||
10 |
config/config_iv.sh
File was created | 1 | OUTDIR="exp/kmeans_euclidian/iv" | |
2 | DATADIR="data" | ||
3 | NEW_LSTDIR="${OUTDIR}/lst" | ||
4 | |||
5 | VECTOR_FILE="data/ivectors.txt" # To specify if there's only one | ||
6 | VECTOR_FILES_ONE=true # Specify there's only one file | ||
7 | |||
8 | METAS_CHARACTER="data/masseffect.lst" | ||
9 | CHAR_INFO="data/masseffect_character_information.csv" | ||
10 | |||
11 | ORIGINAL_VECTOR_FILE="${VECTOR_FILE}" | ||
12 | |||
13 | KMIN=2 | ||
14 | KMAX=100 | ||
15 | |||
16 |
config/config_iv_skyrim.sh
File was created | 1 | OUTDIR="exp/kmeans_euclidian_skyrim/iv" | |
2 | DATADIR="data" | ||
3 | NEW_LSTDIR="${OUTDIR}/lst" | ||
4 | |||
5 | VECTOR_FILE="../data/skyrim/skyrim_ivectors.txt" # To specify if there's only one | ||
6 | VECTOR_FILES_ONE=true # Specify there's only one file | ||
7 | |||
8 | METAS_CHARACTER="../data/skyrim/skyrim.lst" | ||
9 | CHAR_INFO="data/skyrim_character_information.csv" | ||
10 | |||
11 | ORIGINAL_VECTOR_FILE="${VECTOR_FILE}" | ||
12 | |||
13 | KMIN=2 | ||
14 | KMAX=100 | ||
15 | |||
16 |
config/config_pv_from_iv.sh
File was created | 1 | ||
2 | if [ -z "$kfold" ] | ||
3 | then | ||
4 | kfold=1 | ||
5 | fi | ||
6 | |||
7 | if [ -z "${t}" ] | ||
8 | then | ||
9 | t=2.0 | ||
10 | fi | ||
11 | |||
12 | OUTDIR="exp/kmeans_euclidian/pv_from_iv/${kfold}" | ||
13 | DATADIR="data" | ||
14 | MOTHER_LST_DIR="/local_disk/pegasus/laboinfo/mquillot/vocal_similarity_system/data/prot_alpha" | ||
15 | NEW_LSTDIR="${OUTDIR}/lst" | ||
16 | |||
17 | |||
18 | VECTOR_FILE="/local_disk/pegasus/laboinfo/mquillot/knowledge_distillation/exp/kd_iv/${kfold}/${t}/teacher/masseffect_pvectors.txt" # To specify if there's only one | ||
19 | VECTOR_FILES_ONE=true # Specify there's only one file | ||
20 | ORIGINAL_VECTOR_FILE="/local_disk/pegasus/laboinfo/mquillot/knowledge_distillation/data/masseffect.txt" | ||
21 | |||
22 | |||
23 | MIN_KFOLD=${kfold} | ||
24 | MAX_KFOLD=${kfold} | ||
25 | |||
26 | KMIN=2 | ||
27 | KMAX=100 | ||
28 |
config/config_pv_from_xv.sh
File was created | 1 | ||
2 | if [ -z "$kfold" ] | ||
3 | then | ||
4 | kfold=1 | ||
5 | fi | ||
6 | |||
7 | if [ -z "${t}" ] | ||
8 | then | ||
9 | t=2.0 | ||
10 | fi | ||
11 | |||
12 | OUTDIR="exp/kmeans_euclidian/pv_from_xv/${kfold}" | ||
13 | DATADIR="data" | ||
14 | MOTHER_LST_DIR="/local_disk/pegasus/laboinfo/mquillot/vocal_similarity_system/data/prot_alpha" | ||
15 | NEW_LSTDIR="${OUTDIR}/lst" | ||
16 | |||
17 | |||
18 | VECTOR_FILE="/local_disk/pegasus/laboinfo/mquillot/knowledge_distillation/exp/kd_xvectors/${kfold}/${t}/teacher/masseffect_pvectors.txt" # To specify if there's only one | ||
19 | VECTOR_FILES_ONE=true # Specify there's only one file | ||
20 | ORIGINAL_VECTOR_FILE="/local_disk/pegasus/laboinfo/mquillot/knowledge_distillation/data/masseffect_xvectors.txt" | ||
21 | |||
22 | MIN_KFOLD=${kfold} | ||
23 | MAX_KFOLD=${kfold} | ||
24 | |||
25 | KMIN=2 | ||
26 | KMAX=100 | ||
27 |
config/config_without_kfold_iv.sh
File was created | 1 | OUTDIR="exp/kmeans_euclidian_skyrim/ivectors" | |
2 | DATADIR="data" | ||
3 | NEW_LSTDIR="${OUTDIR}/lst" | ||
4 | |||
5 | LST_FILE="/local_disk/pegasus/laboinfo/mquillot/data/skyrim/skyrim_ivectors.txt" | ||
6 | VECTOR_FILE="data/ivectors.txt" # To specify if there's only one | ||
7 | VECTOR_FILES_ONE=true # Specify there's only one file | ||
8 | |||
9 | WITHOUT_KFOLD="" | ||
10 | KMIN=2 | ||
11 | KMAX=100 | ||
12 | |||
13 | METAS_CHARACTER="" |
config/config_xv.sh
File was created | 1 | OUTDIR="exp/kmeans_euclidian/xv" | |
2 | DATADIR="data" | ||
3 | NEW_LSTDIR="${OUTDIR}/lst" | ||
4 | |||
5 | VECTOR_FILE="data/xvectors.txt" # To specify if there's only one | ||
6 | VECTOR_FILES_ONE=true # Specify there's only one file | ||
7 | |||
8 | ORIGINAL_VECTOR_FILE="${VECTOR_FILE}" | ||
9 | KMIN=2 | ||
10 | KMAX=100 | ||
11 |
config/ivector_config.sh
1 | OUTDIR="exp/kmeans_euclidian/ivectors" | File was deleted | |
2 | DATADIR="data" | ||
3 | NEW_LSTDIR="${OUTDIR}/lst" | ||
4 | |||
5 | VECTOR_FILE="data/ivectors.txt" # To specify if there's only one | ||
6 | VECTOR_FILES_ONE=true # Specify there's only one file | ||
7 | |||
8 | KMIN=2 | ||
9 | KMAX=100 | ||
10 | 1 | OUTDIR="exp/kmeans_euclidian/ivectors" |
config/pv_from_xv_config.sh
1 | File was deleted | ||
2 | # Framework configuration | ||
3 | OUTDIR="exp/kmeans_euclidian/pv_from_xv" | ||
4 | DATADIR="data" | ||
5 | NEW_LSTDIR="${OUTDIR}/lst" | ||
6 | |||
7 | VECTOR_FILES_BEGIN="${DATADIR}/pv_from_xv/me_pv_teacher" | ||
8 | VECTOR_FILES_END=".txt" | ||
9 | VECTOR_FILE="" # To specify if there's only one | ||
10 | VECTOR_FILES_ONE=false # Specify there's only one file | ||
11 | |||
12 | KMIN=2 | ||
13 | KMAX=100 | ||
14 | 1 |
config/pvector_config.sh
1 | File was deleted | ||
2 | OUTDIR="exp/kmeans_euclidian/teacher-pvector-1" | ||
3 | DATADIR="data" | ||
4 | NEW_LSTDIR="${OUTDIR}/lst" | ||
5 | |||
6 | VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher" | ||
7 | VECTOR_FILES_END=".txt" | ||
8 | VECTOR_FILE="" # To specify if there's only one | ||
9 | VECTOR_FILES_ONE=false # Specify there's only one file | ||
10 | |||
11 | KMIN=2 | ||
12 | KMAX=100 | ||
13 | 1 |
config/pvector_layer1_config.sh
1 | OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer1" | File was deleted | |
2 | DATADIR="data" | ||
3 | NEW_LSTDIR="${OUTDIR}/lst" | ||
4 | |||
5 | VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_1" | ||
6 | VECTOR_FILES_END=".txt" | ||
7 | VECTOR_FILE="" # To specify if there's only one | ||
8 | VECTOR_FILES_ONE=false # Specify there's only one file | ||
9 | |||
10 | KMIN=2 | ||
11 | KMAX=100 | ||
12 | 1 | OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer1" |
config/pvector_layer2_config.sh
1 | OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer2" | File was deleted | |
2 | DATADIR="data" | ||
3 | NEW_LSTDIR="${OUTDIR}/lst" | ||
4 | |||
5 | VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_2" | ||
6 | VECTOR_FILES_END=".txt" | ||
7 | VECTOR_FILE="" # To specify if there's only one | ||
8 | VECTOR_FILES_ONE=false # Specify there's only one file | ||
9 | |||
10 | KMIN=2 | ||
11 | KMAX=100 | ||
12 | 1 | OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer2" |
config/pvector_layer3_config.sh
1 | OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer3" | File was deleted | |
2 | DATADIR="data" | ||
3 | NEW_LSTDIR="${OUTDIR}/lst" | ||
4 | |||
5 | VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_3" | ||
6 | VECTOR_FILES_END=".txt" | ||
7 | VECTOR_FILE="" # To specify if there's only one | ||
8 | VECTOR_FILES_ONE=false # Specify there's only one file | ||
9 | |||
10 | KMIN=2 | ||
11 | KMAX=100 | ||
12 | 1 | OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer3" |
config/pvector_layer4_config.sh
1 | OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer4" | File was deleted | |
2 | DATADIR="data" | ||
3 | NEW_LSTDIR="${OUTDIR}/lst" | ||
4 | |||
5 | VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_4" | ||
6 | VECTOR_FILES_END=".txt" | ||
7 | VECTOR_FILE="" # To specify if there's only one | ||
8 | VECTOR_FILES_ONE=false # Specify there's only one file | ||
9 | |||
10 | KMIN=2 | ||
11 | KMAX=100 | ||
12 | 1 | OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer4" |
config/xvector_config.sh
1 | OUTDIR="exp/kmeans_euclidian/xvectors" | File was deleted | |
2 | DATADIR="data" | ||
3 | NEW_LSTDIR="${OUTDIR}/lst" | ||
4 | |||
5 | VECTOR_FILE="data/xvectors.txt" # To specify if there's only one | ||
6 | VECTOR_FILES_ONE=true # Specify there's only one file | ||
7 | |||
8 | KMIN=2 | ||
9 | KMAX=100 | ||
10 | 1 | OUTDIR="exp/kmeans_euclidian/xvectors" |
extract-labels-pv-from-xv.sh
1 | File was deleted | ||
2 | |||
3 | # Number of set | ||
4 | k=4 | ||
5 | |||
6 | |||
7 | # Vector features file | ||
8 | DATADIR="data" | ||
9 | |||
10 | VECTOR_FILE_MASSEFFECT="${DATADIR}/xvectors.txt" | ||
11 | |||
12 | for kmean in 12 41 45 50 6 69 72 88 | ||
13 | do | ||
14 | echo "KMEAN: ${kmean}" | ||
15 | # Dirs | ||
16 | EXP_DIR="exp/kmeans_euclidian/pv_from_xv/${k}/${kmean}" | ||
17 | CLUSTERING="${EXP_DIR}/clustering_${kmean}.pkl" | ||
18 | |||
19 | |||
20 | # Output dirs | ||
21 | OUTFILE_MASSEFFECT="data/pv_from_xv/saved_clustered/masseffect_clustered_${k}_${kmean}.txt" | ||
22 | echo "Extracting" | ||
23 | python3 bin/extract_kmeans.py "${CLUSTERING}" \ | ||
24 | "${VECTOR_FILE_MASSEFFECT}" \ | ||
25 | --outfile "$OUTFILE_MASSEFFECT" | ||
26 | echo "End extracting" | ||
27 | done | ||
28 | 1 |
extract-labels.sh
1 | File was deleted | ||
2 | |||
3 | # Number of set | ||
4 | k=4 | ||
5 | kmean=88 | ||
6 | |||
7 | |||
8 | # Vector features file | ||
9 | VECTOR_FILE_MASSEFFECT="data/xvectors.txt" | ||
10 | |||
11 | |||
12 | # Dirs | ||
13 | EXP_DIR="exp/kmeans_euclidian/xvectors/${k}/${kmean}" | ||
14 | CLUSTERING="${EXP_DIR}/clustering_${kmean}.pkl" | ||
15 | |||
16 | |||
17 | # Output dirs | ||
18 | OUTFILE_MASSEFFECT="data/xvectors/saved_clustered/masseffect_clustered_xvectors_${k}_${kmean}.txt" | ||
19 | |||
20 | python3 bin/extract_kmeans.py "${CLUSTERING}" \ | ||
21 | "${VECTOR_FILE_MASSEFFECT}" \ | ||
22 | --outfile "$OUTFILE_MASSEFFECT" | ||
23 | 1 |
rm-unused-files.sh
1 | File was deleted | ||
2 | if [ $# -eq 1 ] | ||
3 | then | ||
4 | EXP_DIR="$1" | ||
5 | else | ||
6 | echo "Need to have one and only one argument. This argument is the exp directory." | ||
7 | exit 1 | ||
8 | fi | ||
9 | |||
10 | for kfold in {1..4} | ||
11 | do | ||
12 | for k in {1..100} | ||
13 | do | ||
14 | rm ${EXP_DIR}/$kfold/$k/clustered_$k.txt | ||
15 | done | ||
16 | done | ||
17 | 1 |
run-clustering.sh
1 | # | 1 | # |
2 | # This script aims to compute clustering | 2 | # This script aims to compute clustering |
3 | # | 3 | # |
4 | 4 | ||
5 | 5 | ||
6 | # -- CONFIGURATION | 6 | # -- CONFIGURATION |
7 | # THIS SCRIPT NEEDS THESE VARIABLES | 7 | # THIS SCRIPT NEEDS THESE VARIABLES |
8 | # Vector file | 8 | # Vector file |
9 | #VECTOR_FILE="" | 9 | #VECTOR_FILE="" |
10 | # Train list | 10 | # Train list |
11 | #TRAIN_LST=="" | 11 | #TRAIN_LST=="" |
12 | # Val list | 12 | # Val list |
13 | #VAL_LST="" | 13 | #VAL_LST="" |
14 | # Exp directory | 14 | # Exp directory |
15 | #EXP_DIR="" | 15 | #EXP_DIR="" |
16 | # Metas file with type values | 16 | # Metas file with type values |
17 | #METAS_TYPE="" | 17 | #METAS_TYPE="" |
18 | # Metas file with character values | 18 | # Metas file with character values |
19 | #METAS_CHARACTER="" | 19 | #METAS_CHARACTER="" |
20 | 20 | ||
21 | 21 | ||
22 | #echo "VECTOR FILE: $VECTOR_FILE" | 22 | #echo "VECTOR FILE: $VECTOR_FILE" |
23 | #echo "TRAIN LIST: $TRAIN_LST" | 23 | #echo "TRAIN LIST: $TRAIN_LST" |
24 | #echo "VAL LIST: $VAL_LST" | 24 | #echo "VAL LIST: $VAL_LST" |
25 | #echo "EXP DIR: $EXP_DIR" | 25 | #echo "EXP DIR: $EXP_DIR" |
26 | #echo "METAS TYPE: $METAS_TYPE" | 26 | #echo "METAS TYPE: $METAS_TYPE" |
27 | #echo "METAS_CHARACTER: $METAS_CHARACTER" | 27 | #echo "METAS_CHARACTER: $METAS_CHARACTER" |
28 | 28 | ||
29 | 29 | ||
30 | 30 | ||
31 | # -- TRAIN KMEANS | 31 | # -- TRAIN KMEANS |
32 | echo "Clustering - ${kfold}" | 32 | echo "Clustering - ${kfold}"sss |
33 | python3 bin/cluster_kmeans.py "${VECTOR_FILE}" \ | 33 | python3 bin/cluster_kmeans.py "${VECTOR_FILE}" \ |
34 | "${TRAIN_LST}" \ | 34 | "${TRAIN_LST}" \ |
35 | "${EXP_DIR}" --kmin ${KMIN} --kmax ${KMAX} | 35 | "${EXP_DIR}" --kmin ${KMIN} --kmax ${KMAX} |
36 | 36 | ||
37 | 37 | ||
38 | 38 | ||
39 | for k in $(seq ${KMIN} 1 ${KMAX}) | 39 | for k in $(seq ${KMIN} 1 ${KMAX}) |
40 | do | 40 | do |
41 | SUB_EXP_DIR="${EXP_DIR}/${k}" | 41 | SUB_EXP_DIR="${EXP_DIR}/${k}" |
42 | 42 | ||
43 | # -- EXTRACT KMEANS VALUES | 43 | # -- EXTRACT KMEANS VALUES |
44 | echo "Kmeans Measuring and extraction - ${k}" | 44 | echo "Kmeans Measuring and extraction - ${k}" |
45 | python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \ | 45 | python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \ |
46 | "${VECTOR_FILE}" \ | 46 | "${VECTOR_FILE}" \ |
47 | --outfile "${SUB_EXP_DIR}/clustered_${k}.txt" | 47 | --outfile "${SUB_EXP_DIR}/clustered_${k}.txt" |
48 | # -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR | 48 | # -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR |
49 | # Measures | 49 | # Measures |
50 | python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | 50 | python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ |
51 | "${METAS_CHARACTER}" \ | 51 | "${METAS_CHARACTER}" \ |
52 | "${TRAIN_LST}" \ | 52 | "${TRAIN_LST}" \ |
53 | "${VAL_LST}" \ | 53 | "${VAL_LST}" \ |
54 | --outfile "${SUB_EXP_DIR}/measures.json" | 54 | --outfile "${SUB_EXP_DIR}/measures.json" |
55 | 55 | ||
56 | # Plot count matrix for train | 56 | # Plot count matrix for train |
57 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | 57 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ |
58 | ${VECTOR_FILE} \ | 58 | ${VECTOR_FILE} \ |
59 | ${TRAIN_LST} \ | 59 | ${TRAIN_LST} \ |
60 | --outfile "${SUB_EXP_DIR}/train_count_matrix.pdf" | 60 | --outfile "${SUB_EXP_DIR}/train_count_matrix.pdf" |
61 | 61 | ||
62 | # Plot count matrix for val | 62 | # Plot count matrix for val |
63 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | 63 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ |
64 | ${VECTOR_FILE} \ | 64 | ${VECTOR_FILE} \ |
65 | ${VAL_LST} \ | 65 | ${VAL_LST} \ |
66 | --outfile "${SUB_EXP_DIR}/val_count_matrix.pdf" | 66 | --outfile "${SUB_EXP_DIR}/val_count_matrix.pdf" |
67 | 67 | ||
68 | # -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR | 68 | # -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR |
69 | # Measures | 69 | # Measures |
70 | python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | 70 | python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ |
71 | "${METAS_TYPE}" \ | 71 | "${METAS_TYPE}" \ |
72 | "${TRAIN_LST}" \ | 72 | "${TRAIN_LST}" \ |
73 | "${VAL_LST}" \ | 73 | "${VAL_LST}" \ |
74 | --outfile "${SUB_EXP_DIR}/measures_type.json" | 74 | --outfile "${SUB_EXP_DIR}/measures_type.json" |
75 | 75 | ||
76 | # This script plot the count matrix of the train set | 76 | # This script plot the count matrix of the train set |
77 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | 77 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ |
78 | "${METAS_TYPE}" \ | 78 | "${METAS_TYPE}" \ |
79 | "${TRAIN_LST}" \ | 79 | "${TRAIN_LST}" \ |
80 | --outfile "${SUB_EXP_DIR}/train_count_matrix_type.pdf" | 80 | --outfile "${SUB_EXP_DIR}/train_count_matrix_type.pdf" |
81 | 81 | ||
82 | # This script plot the count matrix of the validation set | 82 | # This script plot the count matrix of the validation set |
83 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | 83 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ |
84 | "${METAS_TYPE}" \ | 84 | "${METAS_TYPE}" \ |
85 | "${VAL_LST}" \ | 85 | "${VAL_LST}" \ |
86 | --outfile "${SUB_EXP_DIR}/val_count_matrix_type.pdf" | 86 | --outfile "${SUB_EXP_DIR}/val_count_matrix_type.pdf" |
87 | 87 | ||
88 | 88 | ||
89 | # -- MEASURES AND PLOT WITH RESPECT TO LANG VAR | 89 | # -- MEASURES AND PLOT WITH RESPECT TO LANG VAR |
90 | # Measures | 90 | # Measures |
91 | python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | 91 | python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ |
92 | "${METAS_LANG}" \ | 92 | "${METAS_LANG}" \ |
93 | "${TRAIN_LST}" \ | 93 | "${TRAIN_LST}" \ |
94 | "${VAL_LST}" \ | 94 | "${VAL_LST}" \ |
95 | --outfile "${SUB_EXP_DIR}/measures_lang.json" | 95 | --outfile "${SUB_EXP_DIR}/measures_lang.json" |
96 | 96 | ||
97 | # This script plots the count matrix of the train set | 97 | # This script plots the count matrix of the train set |
98 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | 98 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ |
99 | "${METAS_LANG}" \ | 99 | "${METAS_LANG}" \ |
100 | "${TRAIN_LST}" \ | 100 | "${TRAIN_LST}" \ |
101 | --outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf" | 101 | --outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf" |
102 | 102 | ||
103 | # This script plots the count matrix of the validation set | 103 | # This script plots the count matrix of the validation set |
104 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | 104 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ |
105 | "${METAS_LANG}" \ | 105 | "${METAS_LANG}" \ |
106 | "${VAL_LST}" \ | 106 | "${VAL_LST}" \ |
107 | --outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf" | 107 | --outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf" |
108 | 108 | ||
109 | done | 109 | done |
110 | 110 | ||
111 | 111 |
run-measures.sh
1 | # Pour le moment, le run ne fait qu'executer | 1 | # Pour le moment, le run ne fait qu'executer |
2 | # quelques petites commandes que l'on souhaite | 2 | # quelques petites commandes que l'on souhaite |
3 | # tester. | 3 | # tester. |
4 | 4 | ||
5 | OUTDIR="exp/kmeans_euclidian/teacher-pvector-1" | 5 | set -e |
6 | |||
7 | OUTDIR="exp/kmeans_euclidian/ivectors" | ||
6 | EXP_DIR=${OUTDIR} | 8 | EXP_DIR=${OUTDIR} |
7 | DATADIR="data" | 9 | DATADIR="data" |
8 | NEW_LSTDIR="${OUTDIR}/lst" | 10 | NEW_LSTDIR="${OUTDIR}/lst" |
9 | 11 | ||
10 | kmin=2 | 12 | kmin=2 |
11 | kmax=100 | 13 | kmax=100 |
12 | 14 | ||
13 | if [ ! -d "$OUTDIR" ]; | 15 | if [ ! -d "$OUTDIR" ]; |
14 | then | 16 | then |
15 | mkdir -p $OUTDIR | 17 | mkdir -p $OUTDIR |
16 | fi | 18 | fi |
17 | 19 | ||
18 | if [ ! -d "$NEW_LSTDIR" ]; | 20 | if [ ! -d "$NEW_LSTDIR" ]; |
19 | then | 21 | then |
20 | mkdir -p $NEW_LSTDIR | 22 | mkdir -p $NEW_LSTDIR |
21 | fi | 23 | fi |
22 | 24 | ||
23 | for kfold in {1..4} | 25 | for kfold in {1..4} |
24 | do | 26 | do |
25 | pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt" | 27 | #pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt" |
26 | VECTOR_FILE=$pvector_file | 28 | VECTOR_FILE="${DATADIR}/ivectors.txt" |
27 | lst_dir="${DATADIR}/pvectors_1rst/lst" | 29 | lst_dir="${DATADIR}/pvectors_1rst/lst" |
28 | output_kfold="${OUTDIR}/${kfold}" | 30 | output_kfold="${OUTDIR}/${kfold}" |
29 | 31 | ||
30 | #python3 "bin/replace_label.py" \ | 32 | #python3 "bin/replace_label.py" \ |
31 | # "${DATADIR}/masseffect.lst" \ | 33 | # "${DATADIR}/masseffect.lst" \ |
32 | # "${DATADIR}/character_information.csv" \ | 34 | # "${DATADIR}/character_information.csv" \ |
33 | # --field "type" --lst "data/pvectors_1rst/lst/train_${kfold}.lst" \ | 35 | # --field "type" --lst "data/pvectors_1rst/lst/train_${kfold}.lst" \ |
34 | # --outfile "${NEW_LSTDIR}/train_${kfold}_type.lst" | 36 | # --outfile "${NEW_LSTDIR}/train_${kfold}_type.lst" |
35 | 37 | ||
36 | #python3 "bin/replace_label.py" \ | 38 | #python3 "bin/replace_label.py" \ |
37 | # "${DATADIR}/masseffect.lst" \ | 39 | # "${DATADIR}/masseffect.lst" \ |
38 | # "${DATADIR}/character_information.csv" \ | 40 | # "${DATADIR}/character_information.csv" \ |
39 | # --field "type" --lst "data/pvectors_1rst/lst/val_${kfold}.lst" \ | 41 | # --field "type" --lst "data/pvectors_1rst/lst/val_${kfold}.lst" \ |
40 | # --outfile "${NEW_LSTDIR}/val_${kfold}_type.lst" | 42 | # --outfile "${NEW_LSTDIR}/val_${kfold}_type.lst" |
41 | 43 | ||
42 | #cat "${NEW_LSTDIR}/train_${kfold}_type.lst" "${NEW_LSTDIR}/val_${kfold}_type.lst" > "${NEW_LSTDIR}/metas_${kfold}_type.lst" | 44 | #cat "${NEW_LSTDIR}/train_${kfold}_type.lst" "${NEW_LSTDIR}/val_${kfold}_type.lst" > "${NEW_LSTDIR}/metas_${kfold}_type.lst" |
43 | TRAIN_LST=${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst | 45 | TRAIN_LST=${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst |
44 | VAL_LST=${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst | 46 | VAL_LST=${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst |
45 | TRAIN_LANG_LST=${NEW_LSTDIR}/train_${kfold}_lang.lst | 47 | TRAIN_LANG_LST=${NEW_LSTDIR}/train_${kfold}_lang.lst |
46 | VAL_LANG_LST=${NEW_LSTDIR}/val_${kfold}_lang.lst | 48 | VAL_LANG_LST=${NEW_LSTDIR}/val_${kfold}_lang.lst |
47 | METAS_LANG=${NEW_LSTDIR}/metas_${kfold}_lang.lst | 49 | METAS_LANG=${NEW_LSTDIR}/metas_${kfold}_lang.lst |
48 | 50 | ||
49 | # EXTRACT LANGUAGE INFORMATION | 51 | # EXTRACT LANGUAGE INFORMATION |
50 | awk '$2=$1' FS=, OFS=, ${TRAIN_LST} > ${TRAIN_LANG_LST} | 52 | awk '$2=$1' FS=, OFS=, ${TRAIN_LST} > ${TRAIN_LANG_LST} |
51 | echo "VAL EXTRACT LANGUAGE INFO DONE" | 53 | echo "VAL EXTRACT LANGUAGE INFO DONE" |
52 | awk '$2=$1' FS=, OFS=, ${VAL_LST} > ${VAL_LANG_LST} | 54 | awk '$2=$1' FS=, OFS=, ${VAL_LST} > ${VAL_LANG_LST} |
53 | echo "TRAIN EXTRACT LANGUAGE INFO DONE" | 55 | echo "TRAIN EXTRACT LANGUAGE INFO DONE" |
54 | cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}" | 56 | cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}" |
55 | echo "GLOBAL EXTRACT LANGUAGE INFO DONE" | 57 | echo "GLOBAL EXTRACT LANGUAGE INFO DONE" |
56 | 58 | ||
57 | 59 | ||
58 | echo "Clustering - ${kfold}" | 60 | echo "Clustering - ${kfold}" |
59 | 61 | ||
60 | for k in $(seq ${kmin} 1 ${kmax}) | 62 | for k in $(seq ${kmin} 1 ${kmax}) |
61 | do | 63 | do |
62 | echo "Kmeans Measuring and ploting - ${k}" | 64 | echo "Kmeans Measuring and ploting - ${k}" |
63 | 65 | ||
64 | SUB_EXP_DIR="${EXP_DIR}/${kfold}/${k}" | 66 | SUB_EXP_DIR="${EXP_DIR}/${kfold}/${k}" |
65 | 67 | ||
66 | # -- EXTRACT CLUSTERING LABELS | 68 | # -- EXTRACT CLUSTERING LABELS |
67 | python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \ | 69 | python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \ |
68 | "${VECTOR_FILE}" \ | 70 | "${VECTOR_FILE}" \ |
69 | --outfile "${SUB_EXP_DIR}/clustered_${k}.txt" | 71 | --outfile "${SUB_EXP_DIR}/clustered_${k}.txt" |
70 | 72 | ||
71 | # -- MEASURES AND PLOT WITH RESPECT TO LANG VAR | 73 | # -- MEASURES AND PLOT |
72 | # Measures | 74 | source steps/measure_clustering_char.sh |
73 | python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | 75 | source steps/measure_clustering_type.sh |
74 | "${METAS_LANG}" \ | 76 | source steps/measure_clustering_lang.sh |
75 | "${TRAIN_LST}" \ | ||
76 | "${VAL_LST}" \ | ||
77 | --outfile "${SUB_EXP_DIR}/measures_lang.json" | ||
78 | 77 | ||
79 | # This script plot the count matrix of the train set | 78 | rm ${SUB_EXP_DIR}/clustered_${k}.txt |
80 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | ||
81 | "${METAS_LANG}" \ | ||
82 | "${TRAIN_LST}" \ | ||
83 | --outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf" | ||
84 | |||
85 | # This script plot the count matrix of the validation set | ||
86 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | ||
87 | "${METAS_LANG}" \ | ||
88 | "${VAL_LST}" \ | ||
89 | --outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf" | ||
90 | |||
91 | rm ${SUB_EXP_DIR}/clustered_${k}.txt | ||
92 | #python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" \ | ||
93 | # "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" \ | ||
94 | # "${lst_dir}/val_${kfold}.lst" \ | ||
95 | # --outfile "${output_kfold}/${k}/measures_type.json" | ||
96 | |||
97 | # This script plot the count matrix of the train set | ||
98 | #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | ||
99 | # ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \ | ||
100 | # --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf | ||
101 | |||
102 | # This script plot the count matrix of the validation set | ||
103 | #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | ||
104 | # ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \ | ||
105 | # --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf | ||
106 | |||
107 | # This script plot the count matrix of the train set | ||
108 | #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | ||
109 | # ${pvector_file} ${lst_dir}/train_${kfold}.lst \ | ||
110 | # --outfile ${output_kfold}/${k}/train_count_matrix.pdf | ||
111 | |||
112 | # This script plot the count matrix of the validation set | ||
113 | #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \ | ||
114 | # ${pvector_file} ${lst_dir}/val_${kfold}.lst \ | ||
115 | # --outfile ${output_kfold}/${k}/val_count_matrix.pdf | ||
116 | done | 79 | done |
117 | done | 80 | done |
run-skyrim.sh
File was created | 1 | python bin/cluster_kmeans.py ../data/skyrim/skyrim_ivectors.txt ../data/skyrim/skyrim.lst exp/kmeans_euclidian_skyrim/ivectors/ --kmin 1 --kmax 100 | |
2 |
run.sh
1 | 1 | ||
2 | #OUTDIR="exp/test/pvector-2" | 2 | #OUTDIR="exp/test/pvector-2" |
3 | #DATADIR="data" | 3 | #DATADIR="data" |
4 | #NEW_LSTDIR="${OUTDIR}/lst" | 4 | #NEW_LSTDIR="${OUTDIR}/lst" |
5 | 5 | ||
6 | #VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher" | 6 | #VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher" |
7 | #VECTOR_FILES_END=".txt" | 7 | #VECTOR_FILES_END=".txt" |
8 | #VECTOR_FILE="" # To specify if there's only one | 8 | #VECTOR_FILE="" # To specify if there's only one |
9 | #VECTOR_FILES_ONE=false # Specify there's only one file | 9 | #VECTOR_FILES_ONE=false # Specify there's only one file |
10 | 10 | ||
11 | #KMIN=2 | 11 | #KMIN=2 |
12 | #KMAX=100 | 12 | #KMAX=100 |
13 | 13 | ||
14 | # -- LOAD CONFIG FILE | 14 | # -- LOAD CONFIG FILE |
15 | CONFIG_FILE="config.sh" | 15 | CONFIG_FILE="config.sh" |
16 | 16 | ||
17 | if [ $# -eq 1 ] | 17 | if [ $# -eq 1 ] |
18 | then | 18 | then |
19 | CONFIG_FILE="$1" | 19 | CONFIG_FILE="$1" |
20 | else | 20 | else |
21 | echo "Need to have one and only one argument" | 21 | echo "Need to have one and only one argument" |
22 | exit -1 | 22 | exit -1 |
23 | fi | 23 | fi |
24 | 24 | ||
25 | source $CONFIG_FILE | 25 | source $CONFIG_FILE |
26 | 26 | ||
27 | # -- DEFAULTS VALUES CONFIGURATION | 27 | # -- DEFAULTS VALUES CONFIGURATION |
28 | if [ -z "$VECTOR_FILES_ONE" ] | 28 | if [ -z "$VECTOR_FILES_ONE" ] |
29 | then | 29 | then |
30 | VECTOR_FILES_ONE=false | 30 | VECTOR_FILES_ONE=false |
31 | fi | 31 | fi |
32 | 32 | ||
33 | 33 | ||
34 | if [ -z "$METAS_CHARACTER" ] | ||
35 | then | ||
36 | METAS_CHARACTER="${DATADIR}/masseffect.lst" | ||
37 | fi | ||
34 | 38 | ||
39 | |||
40 | if [ -z "$CHAR_INFO" ] | ||
41 | then | ||
42 | CHAR_INFO="${DATADIR}/character_information.csv" | ||
43 | fi | ||
44 | |||
35 | # -- MAKE DIRECTORIES | 45 | # -- MAKE DIRECTORIES |
36 | if [ ! -d "$OUTDIR" ]; | 46 | if [ ! -d "$OUTDIR" ]; |
37 | then | 47 | then |
38 | mkdir -p $OUTDIR | 48 | mkdir -p $OUTDIR |
39 | fi | 49 | fi |
40 | 50 | ||
41 | if [ ! -d "${NEW_LSTDIR}" ]; | 51 | if [ ! -d "${NEW_LSTDIR}" ]; |
42 | then | 52 | then |
43 | mkdir -p ${NEW_LSTDIR} | 53 | mkdir -p ${NEW_LSTDIR} |
44 | fi | 54 | fi |
45 | 55 | ||
46 | 56 | ||
47 | # -- KFOLD MIN and MAX | 57 | # -- KFOLD MIN and MAX |
48 | if [ -z "$MIN_KFOLD" ] | 58 | if [ -z "$MIN_KFOLD" ] |
49 | then | 59 | then |
50 | MIN_KFOLD=1 | 60 | MIN_KFOLD=1 |
51 | fi | 61 | fi |
52 | 62 | ||
53 | if [ -z "$MAX_KFOLD" ] | 63 | if [ -z "$MAX_KFOLD" ] |
54 | then | 64 | then |
55 | MAX_KFOLD=4 | 65 | MAX_KFOLD=4 |
56 | fi | 66 | fi |
57 | 67 | ||
58 | # -- BEGIN BY KFOLD | 68 | # -- BEGIN BY KFOLD |
59 | for kfold in $(seq ${MIN_KFOLD} ${MAX_KFOLD}) | 69 | for kfold in $(seq ${MIN_KFOLD} ${MAX_KFOLD}) |
60 | do | 70 | do |
61 | # Some useful variables | 71 | # Some useful variables |
62 | CHAR_INFO="${DATADIR}/character_information.csv" | ||
63 | TRAIN_TYPE_LST="${NEW_LSTDIR}/train_${kfold}_type.lst" | 72 | TRAIN_TYPE_LST="${NEW_LSTDIR}/train_${kfold}_type.lst" |
64 | VAL_TYPE_LST="${NEW_LSTDIR}/val_${kfold}_type.lst" | 73 | VAL_TYPE_LST="${NEW_LSTDIR}/val_${kfold}_type.lst" |
65 | TRAIN_LANG_LST="${NEW_LSTDIR}/train_${kfold}_lang.lst" | 74 | TRAIN_LANG_LST="${NEW_LSTDIR}/train_${kfold}_lang.lst" |
66 | VAL_LANG_LST="${NEW_LSTDIR}/val_${kfold}_lang.lst" | 75 | VAL_LANG_LST="${NEW_LSTDIR}/val_${kfold}_lang.lst" |
67 | 76 | ||
68 | # Configuration for the run clustering file | 77 | # Configuration for the run clustering file |
69 | if [ ${VECTOR_FILES_ONE} == false ] | 78 | if [ ${VECTOR_FILES_ONE} == false ] |
70 | then | 79 | then |
71 | VECTOR_FILE="${VECTOR_FILES_BEGIN}_${kfold}${VECTOR_FILES_END}" | 80 | VECTOR_FILE="${VECTOR_FILES_BEGIN}_${kfold}${VECTOR_FILES_END}" |
72 | fi | 81 | fi |
73 | 82 | ||
74 | TRAIN_LST="${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst" | 83 | TRAIN_LST="${MOTHER_LST_DIR}/lst/train_${kfold}.lst" |
75 | VAL_LST="${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst" | 84 | VAL_LST="${MOTHER_LST_DIR}/lst/val_${kfold}.lst" |
76 | EXP_DIR="${OUTDIR}/${kfold}" | 85 | EXP_DIR="${OUTDIR}/${kfold}" |
77 | METAS_TYPE="${NEW_LSTDIR}/metas_${kfold}_type.lst" | 86 | METAS_TYPE="${NEW_LSTDIR}/metas_${kfold}_type.lst" |
78 | METAS_CHARACTER="${DATADIR}/masseffect.lst" | ||
79 | METAS_LANG="${NEW_LSTDIR}/metas_${kfold}_lang.lst" | 87 | METAS_LANG="${NEW_LSTDIR}/metas_${kfold}_lang.lst" |
80 | 88 | ||
81 | 89 | ||
82 | if [ ! -d "${EXP_DIR}" ]; | 90 | if [ ! -d "${EXP_DIR}" ]; |
83 | then | 91 | then |
84 | mkdir -p ${EXP_DIR} | 92 | mkdir -p ${EXP_DIR} |
85 | fi | 93 | fi |
86 | 94 | ||
87 | 95 | ||
88 | # EXTRACT TYPE INFORMATION | 96 | # EXTRACT TYPE INFORMATION |
89 | echo "Extracting character information" | 97 | echo "Extracting character information" |
90 | echo "Replace in train" | 98 | echo "Replace in train" |
91 | python3 "bin/replace_label.py" \ | 99 | python3 "bin/replace_label.py" \ |
92 | "${METAS_CHARACTER}" \ | 100 | "${METAS_CHARACTER}" \ |
93 | "${CHAR_INFO}" \ | 101 | "${CHAR_INFO}" \ |
94 | --field "type" \ | 102 | --field "type" \ |
95 | --lst "${TRAIN_LST}" \ | 103 | --lst "${TRAIN_LST}" \ |
96 | --outfile "${TRAIN_TYPE_LST}" | 104 | --outfile "${TRAIN_TYPE_LST}" |
97 | 105 | ||
98 | echo "Replace in val" | 106 | echo "Replace in val" |
99 | python3 "bin/replace_label.py" \ | 107 | python3 "bin/replace_label.py" \ |
100 | "${METAS_CHARACTER}" \ | 108 | "${METAS_CHARACTER}" \ |
101 | "${CHAR_INFO}" \ | 109 | "${CHAR_INFO}" \ |
102 | --field "type" \ | 110 | --field "type" \ |
103 | --lst "${VAL_LST}" \ | 111 | --lst "${VAL_LST}" \ |
104 | --outfile "${VAL_TYPE_LST}" | 112 | --outfile "${VAL_TYPE_LST}" |
105 | 113 | ||
106 | echo "Merge them" | 114 | echo "Merge them" |
107 | cat "${TRAIN_TYPE_LST}" "${VAL_TYPE_LST}" > "${METAS_TYPE}" | 115 | cat "${TRAIN_TYPE_LST}" "${VAL_TYPE_LST}" > "${METAS_TYPE}" |
108 | 116 | ||
109 | # EXTRACT LANGUAGE INFORMATION | 117 | # EXTRACT LANGUAGE INFORMATION |
110 | echo "Language info for train" | 118 | echo "Language info for train" |
111 | awk '$2=$1' FS=, OFS=, ${TRAIN_LST} > ${TRAIN_LANG_LST} | 119 | awk '$2=$1' FS=, OFS=, ${TRAIN_LST} > ${TRAIN_LANG_LST} |
112 | echo "Language info for val" | 120 | echo "Language info for val" |
113 | awk '$2=$1' FS=, OFS=, ${VAL_LST} > ${VAL_LANG_LST} | 121 | awk '$2=$1' FS=, OFS=, ${VAL_LST} > ${VAL_LANG_LST} |
114 | 122 | ||
115 | echo "Merge them" | 123 | echo "Merge them" |
116 | cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}" | 124 | cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}" |
117 | 125 | ||
118 | echo "Then Run Clustering" | 126 | echo "Then Run Clustering" |
119 | source "run-clustering.sh" | 127 | source "run-clustering.sh" |
120 | done | 128 | done |
121 | 129 | ||
122 | # Regroup measures with respect to character classes | 130 | # Regroup measures with respect to character classes |
123 | echo "Regrouping measures with respect to character classes" | 131 | echo "Regrouping measures with respect to character classes" |
124 | python3 "bin/regroup-measures.py" ${OUTDIR} | 132 | python3 "bin/regroup-measures.py" ${OUTDIR} |
125 | 133 | ||
126 | # Regroup measures with respect to type classes | 134 | # Regroup measures with respect to type classes |
127 | echo "Regrouping measures with respect to type classes" | 135 | echo "Regrouping measures with respect to type classes" |
128 | python3 "bin/regroup-measures.py" ${OUTDIR} --suffix "_type" --measurefile "measures_type.json" | 136 | python3 "bin/regroup-measures.py" ${OUTDIR} --suffix "_type" --measurefile "measures_type.json" |
129 | 137 |
run_kfold.sh
File was created | 1 | ||
2 | for kfold in `seq 1 4` | ||
3 | do | ||
4 | echo "KFOLD: ${kfold}" | ||
5 | source run.sh | ||
6 | done | ||
7 | |||
8 |
run_without_kfold.sh
File was created | 1 | ||
2 | for k in $(seq ${KMIN} 1 ${KMAX}) | ||
3 | do | ||
4 | SUB_EXP_DIR="${EXP_DIR}/${k}" | ||
5 | |||
6 | # -- EXTRACT KMEANS VALUES | ||
7 | echo "Kmeans Measuring and extraction - ${k}" | ||
8 | python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \ | ||
9 | "${VECTOR_FILE}" \ | ||
10 | --outfile "${SUB_EXP_DIR}/clustered_${k}.txt" | ||
11 | |||
12 | python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | ||
13 | "${METAS_CHARACTER}" \ | ||
14 | "${TRAIN_LST}" \ | ||
15 | "${VAL_LST}" \ | ||
16 | --outfile "${SUB_EXP_DIR}/measures.json" |
steps/extract_cluster_file.sh
File was created | 1 | ||
2 | for kfold in `seq 1 4` | ||
3 | do | ||
4 | source $1 | ||
5 | vector_file=${VECTOR_FILE} | ||
6 | echo "kfold: $kfold" | ||
7 | for kmean in `seq 2 100` | ||
8 | do | ||
9 | echo "kmean: $kmean" | ||
10 | exp_dir="${OUTDIR}/${kfold}/${kmean}" | ||
11 | clustering="${exp_dir}/clustering_${kmean}.pkl" | ||
12 | save_loc="${exp_dir}" | ||
13 | saved_txt="${save_loc}/masseffect_clustered.txt" | ||
14 | saved_lst="${save_loc}/masseffect_clustered.lst" | ||
15 | |||
16 | python3 bin/extract_kmeans.py "${clustering}" \ | ||
17 | "${vector_file}" \ | ||
18 | --outfile "${saved_txt}" | ||
19 | |||
20 | cat ${saved_txt} | cut -d" " -f1 > ${saved_lst} | ||
21 | |||
22 | python3 bin/replace-features.py "${ORIGINAL_VECTOR_FILE}" "${saved_txt}" | ||
23 | done | ||
24 | done | ||
25 |
steps/extract_cluster_file_skyrim.sh
File was created | 1 | ||
2 | source $1 | ||
3 | vector_file=${VECTOR_FILE} | ||
4 | echo "kfold: $kfold" | ||
5 | for kmean in `seq 2 100` | ||
6 | do | ||
7 | echo "kmean: $kmean" | ||
8 | exp_dir="${OUTDIR}/${kmean}" | ||
9 | clustering="${exp_dir}/clustering_${kmean}.pkl" | ||
10 | save_loc="${exp_dir}" | ||
11 | saved_txt="${save_loc}/masseffect_clustered.txt" | ||
12 | saved_lst="${save_loc}/masseffect_clustered.lst" | ||
13 | |||
14 | python3 bin/extract_kmeans.py "${clustering}" \ | ||
15 | "${vector_file}" \ | ||
16 | --outfile "${saved_txt}" | ||
17 | |||
18 | cat ${saved_txt} | cut -d" " -f1 > ${saved_lst} | ||
19 | |||
20 | python3 bin/replace-features.py "${ORIGINAL_VECTOR_FILE}" "${saved_txt}" | ||
21 | done | ||
22 | |||
23 |
steps/extract_language_lst.sh
File was created | 1 | DATADIR="data" | |
2 | OUTDIR="exp/kmeans_euclidian/ivectors" | ||
3 | NEW_LSTDIR="${OUTDIR}/lst" | ||
4 | |||
5 | TRAIN_LST=${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst | ||
6 | VAL_LST=${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst | ||
7 | TRAIN_LANG_LST=${NEW_LSTDIR}/train_${kfold}_lang.lst | ||
8 | VAL_LANG_LST=${NEW_LSTDIR}/val_${kfold}_lang.lst | ||
9 | METAS_LANG=${NEW_LSTDIR}/metas_${kfold}_lang.lst | ||
10 | |||
11 | |||
12 | awk '$2=$1' FS=, OFS=, ${TRAIN_LST} > ${TRAIN_LANG_LST} | ||
13 | echo "VAL EXTRACT LANGUAGE INFO DONE" | ||
14 | awk '$2=$1' FS=, OFS=, ${VAL_LST} > ${VAL_LANG_LST} | ||
15 | echo "TRAIN EXTRACT LANGUAGE INFO DONE" | ||
16 | cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}" | ||
17 | echo "GLOBAL EXTRACT LANGUAGE INFO DONE" |
steps/measure_clustering_char.sh
File was created | 1 | ||
2 | python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" \ | ||
3 | "${lst_dir}/trainval_${kfold}.lst" "${lst_dir}/train_${kfold}.lst" \ | ||
4 | "${lst_dir}/val_${kfold}.lst" \ | ||
5 | --outfile "${output_kfold}/${k}/measures.json" | ||
6 | |||
7 | |||
8 | # This script plots the count matrix of the train set | |
9 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | ||
10 | "${lst_dir}/train_${kfold}.lst" \ | ||
11 | "${lst_dir}/train_${kfold}.lst" \ | ||
12 | --outfile "${SUB_EXP_DIR}/train_count_matrix.pdf" | ||
13 | |||
14 | # This script plots the count matrix of the validation set | |
15 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | ||
16 | "${lst_dir}/val_${kfold}.lst" \ | ||
17 | "${lst_dir}/val_${kfold}.lst" \ | ||
18 | --outfile "${SUB_EXP_DIR}/val_count_matrix.pdf" | ||
19 |
steps/measure_clustering_lang.sh
File was created | 1 | ||
2 | python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | ||
3 | "${METAS_LANG}" \ | ||
4 | "${TRAIN_LST}" \ | ||
5 | "${VAL_LST}" \ | ||
6 | --outfile "${SUB_EXP_DIR}/measures_lang.json" | ||
7 | |||
8 | # This script plots the count matrix of the train set | |
9 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | ||
10 | "${METAS_LANG}" \ | ||
11 | "${TRAIN_LST}" \ | ||
12 | --outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf" | ||
13 | |||
14 | # This script plots the count matrix of the validation set | |
15 | python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ | ||
16 | "${METAS_LANG}" \ | ||
17 | "${VAL_LST}" \ | ||
18 | --outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf" |
steps/measure_clustering_type.sh
File was created | 1 | python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" \ | |
2 | "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" \ | ||
3 | "${lst_dir}/val_${kfold}.lst" \ | ||
4 | --outfile "${output_kfold}/${k}/measures_type.json" | ||
5 | |||
6 | # This script plots the count matrix of the train set | |
7 | python3 bin/plot-count-matrix.py "${output_kfold}/${k}/clustered_${k}.txt" \ | ||
8 | "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" \ | ||
9 | --outfile "${output_kfold}/${k}/train_count_matrix_type.pdf" | ||
10 | |||
11 | # This script plots the count matrix of the validation set | |
12 | python3 bin/plot-count-matrix.py "${output_kfold}/${k}/clustered_${k}.txt" \ | ||
13 | "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/val_${kfold}.lst" \ | ||
14 | --outfile "${output_kfold}/${k}/val_count_matrix_type.pdf" | ||
15 |
steps/save_clusters_file.sh
File was created | 1 | ||
2 | vector_file="data/xvectors.txt" | ||
3 | |||
4 | for kfold in `seq 1 4` | ||
5 | do | ||
6 | echo "kfold: $kfold" | ||
7 | for kmean in `seq 2 100` | ||
8 | do | ||
9 | echo "kmean: $kmean" | ||
10 | exp_dir="exp/kmeans_euclidian/xvectors/${kfold}/${kmean}" | ||
11 | clustering="${exp_dir}/clustering_${kmean}.pkl" | ||
12 | save_loc="data/xvectors/saved_clustered/" | ||
13 | saved_txt="${save_loc}/masseffect_clustered_xvectors_${kfold}_${kmean}.txt" | ||
14 | saved_lst="${save_loc}/masseffect_clustered_xvectors_${kfold}_${kmean}.lst" | ||
15 | |||
16 | python3 bin/extract_kmeans.py "${clustering}" \ | ||
17 | "${vector_file}" \ | ||
18 | --outfile "${saved_txt}" | ||
19 | |||
20 | cat ${saved_txt} | cut -d" " -f1 > ${saved_lst} | ||
21 | done | ||
22 | done | ||
23 |
utils/extract-labels.sh
File was created | 1 | ||
2 | |||
3 | # Fold index (k) and k-means value (kmean) selecting the experiment directory | |
4 | k=4 | ||
5 | kmean=88 | ||
6 | |||
7 | |||
8 | # Vector features file | ||
9 | VECTOR_FILE_MASSEFFECT="data/xvectors.txt" | ||
10 | |||
11 | |||
12 | # Dirs | ||
13 | EXP_DIR="exp/kmeans_euclidian/xvectors/${k}/${kmean}" | ||
14 | CLUSTERING="${EXP_DIR}/clustering_${kmean}.pkl" | ||
15 | |||
16 | |||
17 | # Output file | |
18 | OUTFILE_MASSEFFECT="data/xvectors/saved_clustered/masseffect_clustered_xvectors_${k}_${kmean}.txt" | ||
19 | |||
20 | python3 bin/extract_kmeans.py "${CLUSTERING}" \ | ||
21 | "${VECTOR_FILE_MASSEFFECT}" \ | ||
22 | --outfile "$OUTFILE_MASSEFFECT" | ||
23 |
utils/rm-unused-files.sh
File was created | 1 | ||
2 | if [ $# -eq 1 ] | ||
3 | then | ||
4 | EXP_DIR="$1" | ||
5 | else | ||
6 | echo "Need to have one and only one argument. This argument is the exp directory." | ||
7 | exit 1 | ||
8 | fi | ||
9 | |||
10 | for kfold in {1..4} | ||
11 | do | ||
12 | for k in {1..100} | ||
13 | do | ||
14 | rm ${EXP_DIR}/$kfold/$k/clustered_$k.txt | ||
15 | done | ||
16 | done | ||
17 |
utils/transform_exp_to_kd.sh
File was created | 1 | ||
2 | # -- DESCRIPTION -- | ||
3 | # | ||
4 | # This script transforms the data into a shape that is | |
5 | # usable mainly by knowledge distillation scripts. | |
6 | # | |
7 | # First, it extracts the clustering labels, | |
8 | # then replaces the features with the given ones, | |
9 | # and finally generates a list file. | |
10 | # | |
11 | # The resulting pair (features file, list file) will be usable | |
12 | # by the knowledge distillation system. | |
13 | # -------------------- | ||
14 | |||
15 | |||
16 | # -- CONFIGURATION -- | ||
17 | # Exit as soon as a command fails | |
18 | set -e | ||
19 | |||
20 | # KFOLD config | ||
21 | MIN_KFOLD=1 | ||
22 | MAX_KFOLD=4 | ||
23 | |||
24 | # KMEAN config | ||
25 | MIN_KMEAN=2 | ||
26 | MAX_KMEAN=100 | ||
27 | |||
28 | # Vector features file | ||
29 | DATADIR="data" | ||
30 | FEATURES_DIR="${DATADIR}/pv_from_xv" | ||
31 | FEATURES_PREFIX="me_pv_teacher" | ||
32 | FEATURES_SUFFIX=".txt" | ||
33 | |||
34 | EXP_DIR="exp/kmeans_euclidian/pv_from_xv" | ||
35 | VECTOR_FILE_MASSEFFECT="${DATADIR}/xvectors.txt" | ||
36 | OUTDIR="data/pv_from_xv/saved_clustered" | ||
37 | |||
38 | # -- CREATE DIRECTORIES | ||
39 | # OUTPUT DIRECTORY | ||
40 | if [ ! -d "${OUTDIR}" ] | ||
41 | then | ||
42 | mkdir -p ${OUTDIR} | ||
43 | fi | ||
44 | |||
45 | |||
46 | # -- FUNCTIONS -- | ||
47 | # Definition of the transform function | ||
48 | function transform() { | ||
49 | # Define subdir variable | ||
50 | local SUB_EXP_DIR="${EXP_DIR}/${k}/${kmean}" | ||
51 | |||
52 | # Define features file variable | ||
53 | local INITIAL_VECTOR_FILE="${FEATURES_DIR}/${FEATURES_PREFIX}_${k}${FEATURES_SUFFIX}" | ||
54 | |||
55 | # Information of the current process | ||
56 | echo "[KFOLD, KMEAN]: [${k}, ${kmean}]" | ||
57 | |||
58 | # Define clustering model variable | ||
59 | local CLUSTERING="${SUB_EXP_DIR}/clustering_${kmean}.pkl" | ||
60 | |||
61 | |||
62 | # Define output file | ||
63 | local OUTFILE_MASSEFFECT="${OUTDIR}/masseffect_clustered_${k}_${kmean}.txt" | ||
64 | |||
65 | # Extracting clustering labels | ||
66 | echo "Extracting clustering labels" | ||
67 | python3 bin/extract_kmeans.py "${CLUSTERING}" \ | ||
68 | "${INITIAL_VECTOR_FILE}" \ | ||
69 | --outfile "${OUTFILE_MASSEFFECT}" | ||
70 | |||
71 | # Changing features | ||
72 | echo "Changing features" | ||
73 | python bin/replace-features.py ${VECTOR_FILE_MASSEFFECT} ${OUTFILE_MASSEFFECT} | ||
74 | |||
75 | # Extracting list file | ||
76 | cut -d' ' -f1 ${OUTFILE_MASSEFFECT} > "${OUTDIR}/masseffect_clustered_${k}_${kmean}.lst" | ||
77 | } | ||
78 | |||
79 | |||
80 | # -- MAIN LOOPS | ||
81 | for k in $(seq ${MIN_KFOLD} ${MAX_KFOLD}) | ||
82 | do | ||
83 | for kmean in $(seq ${MIN_KMEAN} ${MAX_KMEAN}) | ||
84 | do | ||
85 | transform | ||
86 | done | ||
87 | done | ||
88 |