Commit e63ab06fc786597258d861e68c335de9e2afceb4

Authored by Mathias Quillot
1 parent c95c2bf75c
Exists in master

New organisation of the project

Showing 44 changed files with 544 additions and 210 deletions Inline Diff

1 # Clustering 1 # Clustering
2 A repository where I put everything dealing with clustering algorithms. 2 A repository where I put everything dealing with clustering algorithms.
3 3
4 # How to use
5 You can run the run.sh script directly if you want. You just need data.
6
7 You can also use the scripts in utils, but run them from the root directory "clustering/".
8
4 # TODO 9 # TODO
5 - Organiser les différentes listes de données pour mes expériences 10 - Organiser les différentes listes de données pour mes expériences
6 - Create a data file example 11 - Create a data file example
7 12
8 # Data 13 # Data
9 14
10 # File format 15 # File format
11 16
bin/regroup-measures.py
1 ''' 1 '''
2 Regroup results into one file and a plot. 2 Regroup results into one file and a plot.
3 TODO: Mettre en valeur les valeurs maximales 3 TODO: Mettre en valeur les valeurs maximales
4 TODO: Sauvegarder les valeurs quelques part pour qu'on puisse facilement les retrouver. 4 TODO: Sauvegarder les valeurs quelques part pour qu'on puisse facilement les retrouver.
5 5
6 ''' 6 '''
7 7
8 import numpy as np 8 import numpy as np
9 import matplotlib.pyplot as plt 9 import matplotlib.pyplot as plt
10 import argparse 10 import argparse
11 import os 11 import os
12 import json 12 import json
13 13
14 14
15 def plot_values_clusters(values, title, xlabel, ylabel): 15 def plot_values_clusters(values, title, xlabel, ylabel):
16 values = np.asarray(values) 16 values = np.asarray(values)
17 x = np.arange(len(values)) + 2 17 x = np.arange(len(values)) + 2
18 x_ticks = np.arange(len(values), step=10) + 2 18 x_ticks = np.arange(len(values), step=10) + 2
19 y = values 19 y = values
20 plt.scatter(x, y, s=1) 20 plt.scatter(x, y, s=1)
21 plt.xticks(x_ticks) 21 plt.xticks(x_ticks)
22 plt.title(title) 22 plt.title(title)
23 plt.xlabel(xlabel) 23 plt.xlabel(xlabel)
24 plt.ylabel(ylabel) 24 plt.ylabel(ylabel)
25 25
26 26
27 def save_plot(filepath): 27 def save_plot(filepath):
28 plt.savefig(filepath) 28 plt.savefig(filepath)
29 plt.close() 29 plt.close()
30 30
31 31
32 def save_results(outfile, measures, titles): 32 def save_results(outfile, measures, titles):
33 with open(outfile, "w") as f: 33 with open(outfile, "w") as f:
34 f.write(",".join(titles) + "\n") 34 f.write(",".join(titles) + "\n")
35 n = len(measures[0]) 35 n = len(measures[0])
36 for i in range(n): 36 for i in range(n):
37 f.write(",".join([str(ms[i]) for ms in measures]) + "\n") 37 f.write(",".join([str(ms[i]) for ms in measures]) + "\n")
38 38
39 39
40 # -- PARSER 40 # -- PARSER
41 parser = argparse.ArgumentParser(description="") 41 parser = argparse.ArgumentParser(description="")
42 parser.add_argument("expdir", type=str, help="Directory of experiment") 42 parser.add_argument("expdir", type=str, help="Directory of experiment")
43 parser.add_argument("--nkfold", type=int, default=4, help="number of kfold")
44 parser.add_argument("--nkfoldmin", type=int, default=1, help="Begin with this numero of kfold")
43 parser.add_argument("--measurefile", type=str, default="measures.json", 45 parser.add_argument("--measurefile", type=str, default="measures.json",
44 help="Measure file it searchs in folders") 46 help="Measure file it searchs in folders")
45 parser.add_argument("--suffix", type=str, default="", 47 parser.add_argument("--suffix", type=str, default="",
46 help="suffix of saved files") 48 help="suffix of saved files")
47 49
48 args = parser.parse_args() 50 args = parser.parse_args()
49 EXP_DIR = args.expdir 51 EXP_DIR = args.expdir
50 MEASURE_FILE = args.measurefile 52 MEASURE_FILE = args.measurefile
51 SUFFIX = args.suffix 53 SUFFIX = args.suffix
54 MAX_KFOLD = args.nkfold
55 MIN_KFOLD = args.nkfoldmin
52 56
53 # EXP_DIR="exp/kmeans_teacher_1/pvector-1" 57 # EXP_DIR="exp/kmeans_teacher_1/pvector-1"
54 RESULTS_DIR = os.path.join(EXP_DIR, "res") 58 RESULTS_DIR = os.path.join(EXP_DIR, "res")
55 59
56 # -- CONFIG 60 # -- CONFIG
57 kmin = 2 61 kmin = 2
58 kmax = 100 62 kmax = 100
59 63
60 64
61 # -- CREATE FOLDER 65 # -- CREATE FOLDER
62 if not os.path.exists(RESULTS_DIR): 66 if not os.path.exists(RESULTS_DIR):
63 os.makedirs(RESULTS_DIR) 67 os.makedirs(RESULTS_DIR)
64 68
65 # -- BEGIN REGROUPMENT 69 # -- BEGIN REGROUPMENT
66 70
67 subsets = ["train", "val"] 71 subsets = ["train", "val"]
68 72
69 disequilibriums = [] 73 disequilibriums = []
70 74
71 75
72 def init_measures(): 76 def init_measures():
73 measures = {} 77 measures = {}
74 78
75 for subset in subsets: 79 for subset in subsets:
76 measures[subset] = {} 80 measures[subset] = {}
77 measures[subset]["entropy"] = [] 81 measures[subset]["entropy"] = []
78 measures[subset]["vscore"] = [] 82 measures[subset]["vscore"] = []
79 measures[subset]["homogeneity"] = [] 83 measures[subset]["homogeneity"] = []
80 measures[subset]["completeness"] = [] 84 measures[subset]["completeness"] = []
81 return measures 85 return measures
82 86
83 87
84 measures = init_measures() 88 measures = init_measures()
85 89
86 for kfold in range(1, 5): 90 for kfold in range(MIN_KFOLD, MAX_KFOLD + 1):
87 print("Regrouping on kfold: " + str(kfold)) 91 print("Regrouping on kfold: " + str(kfold))
88 # -- REGROUP MEASURES INTO LISTS 92 # -- REGROUP MEASURES INTO LISTS
89 for k in range(kmin, kmax+1): 93 for k in range(kmin, kmax+1):
90 measures_file = os.path.join(EXP_DIR, str(kfold), str(k), MEASURE_FILE) 94 measures_file = os.path.join(EXP_DIR, str(kfold), str(k), MEASURE_FILE)
91 with open(measures_file, 'r') as f: 95 with open(measures_file, 'r') as f:
92 meas_data = json.load(f) 96 meas_data = json.load(f)
93 disequilibriums.append(meas_data["disequilibrium"]) 97 disequilibriums.append(meas_data["disequilibrium"])
94 for subset in subsets: 98 for subset in subsets:
95 measures[subset]["entropy"].append( 99 measures[subset]["entropy"].append(
96 meas_data[subset]["entropy"]) 100 meas_data[subset]["entropy"])
97 measures[subset]["vscore"].append( 101 measures[subset]["vscore"].append(
98 meas_data[subset]["vscore"]) 102 meas_data[subset]["vscore"])
99 measures[subset]["homogeneity"].append( 103 measures[subset]["homogeneity"].append(
100 meas_data[subset]["homogeneity"]) 104 meas_data[subset]["homogeneity"])
101 measures[subset]["completeness"].append( 105 measures[subset]["completeness"].append(
102 meas_data[subset]["completeness"]) 106 meas_data[subset]["completeness"])
103 107
104 # -- PLOT AND SAVE MEASURES FOR A SPECIFIC SUBSET 108 # -- PLOT AND SAVE MEASURES FOR A SPECIFIC SUBSET
105 for subset in subsets: 109 for subset in subsets:
106 # Plot all measures 110 # Plot all measures
107 outf = "measures_" + str(subset) + "_" + str(kfold) + str(SUFFIX) + ".pdf" 111 outf = "measures_" + str(subset) + "_" + str(kfold) + str(SUFFIX) + ".pdf"
108 112
109 fig = plt.figure(1) 113 fig = plt.figure(1)
110 for i, measure in enumerate(measures[subset]): 114 for i, measure in enumerate(measures[subset]):
111 115
112 plt.subplot(220 + i + 1) 116 plt.subplot(220 + i + 1)
113 117
114 plot_values_clusters( 118 plot_values_clusters(
115 measures[subset][measure], 119 measures[subset][measure],
116 measure.capitalize() + " " + str(subset) + " set " + str(kfold), 120 measure.capitalize() + " " + str(subset) + " set " + str(kfold),
117 "N clusters", 121 "N clusters",
118 measure.capitalize()) 122 measure.capitalize())
119 plt.subplots_adjust(hspace=0.5, wspace=0.3) 123 plt.subplots_adjust(hspace=0.5, wspace=0.3)
120 save_plot(os.path.join(RESULTS_DIR, outf)) 124 save_plot(os.path.join(RESULTS_DIR, outf))
121 125
122 # Save all measures on a csv file 126 # Save all measures on a csv file
123 save_results( 127 save_results(
124 os.path.join(RESULTS_DIR, "measures_" + str(subset) + "_" + str(kfold) + str(SUFFIX) + ".csv"), 128 os.path.join(RESULTS_DIR, "measures_" + str(subset) + "_" + str(kfold) + str(SUFFIX) + ".csv"),
125 [ 129 [
126 measures[subset]["entropy"], 130 measures[subset]["entropy"],
127 measures[subset]["homogeneity"], 131 measures[subset]["homogeneity"],
128 measures[subset]["completeness"], 132 measures[subset]["completeness"],
129 measures[subset]["vscore"] 133 measures[subset]["vscore"]
130 ], 134 ],
131 [ 135 [
132 "entropy", 136 "entropy",
133 "homogeneity", 137 "homogeneity",
134 "completeness", 138 "completeness",
135 "vscore" 139 "vscore"
136 ] 140 ]
137 ) 141 )
138 142
139 # PLOT AND SAVE FOR DISEQUILIBRIUM 143 # PLOT AND SAVE FOR DISEQUILIBRIUM
140 plot_values_clusters( 144 plot_values_clusters(
141 disequilibriums, 145 disequilibriums,
142 "Disequilibrium set " + str(kfold), 146 "Disequilibrium set " + str(kfold),
143 "N clusters", 147 "N clusters",
144 "Disequilibrium") 148 "Disequilibrium")
145 save_plot(os.path.join(RESULTS_DIR, "disequilibrium_" + str(kfold) + str(SUFFIX) + ".pdf")) 149 save_plot(os.path.join(RESULTS_DIR, "disequilibrium_" + str(kfold) + str(SUFFIX) + ".pdf"))
146 150
147 save_results( 151 save_results(
148 os.path.join(RESULTS_DIR, "disequilibrium_" + str(kfold) + str(SUFFIX) + ".csv"), 152 os.path.join(RESULTS_DIR, "disequilibrium_" + str(kfold) + str(SUFFIX) + ".csv"),
149 [disequilibriums], 153 [disequilibriums],
150 ["disequilibrium"]) 154 ["disequilibrium"])
151 155
152 measures = init_measures() 156 measures = init_measures()
153 disequilibriums = [] 157 disequilibriums = []
154 158
bin/replace-features.py
File was created 1
2 import argparse
3
4 from data import read_file, index_by_id, write_line
5
6 # -- ARGPARSE
7 parser = argparse.ArgumentParser(
8 description="Replace features with file from to file to")
9 parser.add_argument("fromfile", type=str, help="From list or features file")
10 parser.add_argument("tofile", type=str, help="Features of 'from' saved into this file.")
11
12 args = parser.parse_args()
13 FROM = args.fromfile
14 TO = args.tofile
15
16
17 # -- READ AND INDEX FILES
18 from_data = read_file(FROM)
19 from_by_id = index_by_id(from_data)
20
21 to_data = read_file(TO)
22
23 with open(TO, "w") as f:
24 for line in to_data:
25 metas = line[0]
26 features = from_by_id[metas[0]][metas[3]][1]
27 write_line(metas, features, f)
28
29
config/archives/ivector_config.sh
File was created 1 OUTDIR="exp/kmeans_euclidian/ivectors"
2 DATADIR="data"
3 NEW_LSTDIR="${OUTDIR}/lst"
4
5 VECTOR_FILE="data/ivectors.txt" # To specify if there's only one
6 VECTOR_FILES_ONE=true # Specify there's only one file
7
8 KMIN=2
9 KMAX=100
10
config/archives/pv_from_xv_config.sh
File was created 1
2 # Framework configuration
3 OUTDIR="exp/kmeans_euclidian/pv_from_xv"
4 DATADIR="data"
5 NEW_LSTDIR="${OUTDIR}/lst"
6
7 VECTOR_FILES_BEGIN="${DATADIR}/pv_from_xv/me_pv_teacher"
8 VECTOR_FILES_END=".txt"
9 VECTOR_FILE="" # To specify if there's only one
10 VECTOR_FILES_ONE=false # Specify there's only one file
11
12 KMIN=2
13 KMAX=100
14
config/archives/pvector_config.sh
File was created 1
2 OUTDIR="exp/kmeans_euclidian/teacher-pvector-1"
3 DATADIR="data"
4 NEW_LSTDIR="${OUTDIR}/lst"
5
6 VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher"
7 VECTOR_FILES_END=".txt"
8 VECTOR_FILE="" # To specify if there's only one
9 VECTOR_FILES_ONE=false # Specify there's only one file
10
11 KMIN=2
12 KMAX=100
13
config/archives/pvector_layer1_config.sh
File was created 1 OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer1"
2 DATADIR="data"
3 NEW_LSTDIR="${OUTDIR}/lst"
4
5 VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_1"
6 VECTOR_FILES_END=".txt"
7 VECTOR_FILE="" # To specify if there's only one
8 VECTOR_FILES_ONE=false # Specify there's only one file
9
10 KMIN=2
11 KMAX=100
12
config/archives/pvector_layer2_config.sh
File was created 1 OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer2"
2 DATADIR="data"
3 NEW_LSTDIR="${OUTDIR}/lst"
4
5 VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_2"
6 VECTOR_FILES_END=".txt"
7 VECTOR_FILE="" # To specify if there's only one
8 VECTOR_FILES_ONE=false # Specify there's only one file
9
10 KMIN=2
11 KMAX=100
12
config/archives/pvector_layer3_config.sh
File was created 1 OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer3"
2 DATADIR="data"
3 NEW_LSTDIR="${OUTDIR}/lst"
4
5 VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_3"
6 VECTOR_FILES_END=".txt"
7 VECTOR_FILE="" # To specify if there's only one
8 VECTOR_FILES_ONE=false # Specify there's only one file
9
10 KMIN=2
11 KMAX=100
12
config/archives/pvector_layer4_config.sh
File was created 1 OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer4"
2 DATADIR="data"
3 NEW_LSTDIR="${OUTDIR}/lst"
4
5 VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_4"
6 VECTOR_FILES_END=".txt"
7 VECTOR_FILE="" # To specify if there's only one
8 VECTOR_FILES_ONE=false # Specify there's only one file
9
10 KMIN=2
11 KMAX=100
12
config/archives/xvector_config.sh
File was created 1 OUTDIR="exp/kmeans_euclidian/xvectors"
2 DATADIR="data"
3 NEW_LSTDIR="${OUTDIR}/lst"
4
5 VECTOR_FILE="data/xvectors.txt" # To specify if there's only one
6 VECTOR_FILES_ONE=true # Specify there's only one file
7
8 KMIN=2
9 KMAX=100
10
File was created 1 OUTDIR="exp/kmeans_euclidian/iv"
2 DATADIR="data"
3 NEW_LSTDIR="${OUTDIR}/lst"
4
5 VECTOR_FILE="data/ivectors.txt" # To specify if there's only one
6 VECTOR_FILES_ONE=true # Specify there's only one file
7
8 METAS_CHARACTER="data/masseffect.lst"
9 CHAR_INFO="data/masseffect_character_information.csv"
10
11 ORIGINAL_VECTOR_FILE="${VECTOR_FILE}"
12
13 KMIN=2
14 KMAX=100
15
16
config/config_iv_skyrim.sh
File was created 1 OUTDIR="exp/kmeans_euclidian_skyrim/iv"
2 DATADIR="data"
3 NEW_LSTDIR="${OUTDIR}/lst"
4
5 VECTOR_FILE="../data/skyrim/skyrim_ivectors.txt" # To specify if there's only one
6 VECTOR_FILES_ONE=true # Specify there's only one file
7
8 METAS_CHARACTER="../data/skyrim/skyrim.lst"
9 CHAR_INFO="data/skyrim_character_information.csv"
10
11 ORIGINAL_VECTOR_FILE="${VECTOR_FILE}"
12
13 KMIN=2
14 KMAX=100
15
16
config/config_pv_from_iv.sh
File was created 1
2 if [ -z "$kfold" ]
3 then
4 kfold=1
5 fi
6
7 if [ -z "${t}" ]
8 then
9 t=2.0
10 fi
11
12 OUTDIR="exp/kmeans_euclidian/pv_from_iv/${kfold}"
13 DATADIR="data"
14 MOTHER_LST_DIR="/local_disk/pegasus/laboinfo/mquillot/vocal_similarity_system/data/prot_alpha"
15 NEW_LSTDIR="${OUTDIR}/lst"
16
17
18 VECTOR_FILE="/local_disk/pegasus/laboinfo/mquillot/knowledge_distillation/exp/kd_iv/${kfold}/${t}/teacher/masseffect_pvectors.txt" # To specify if there's only one
19 VECTOR_FILES_ONE=true # Specify there's only one file
20 ORIGINAL_VECTOR_FILE="/local_disk/pegasus/laboinfo/mquillot/knowledge_distillation/data/masseffect.txt"
21
22
23 MIN_KFOLD=${kfold}
24 MAX_KFOLD=${kfold}
25
26 KMIN=2
27 KMAX=100
28
config/config_pv_from_xv.sh
File was created 1
2 if [ -z "$kfold" ]
3 then
4 kfold=1
5 fi
6
7 if [ -z "${t}" ]
8 then
9 t=2.0
10 fi
11
12 OUTDIR="exp/kmeans_euclidian/pv_from_xv/${kfold}"
13 DATADIR="data"
14 MOTHER_LST_DIR="/local_disk/pegasus/laboinfo/mquillot/vocal_similarity_system/data/prot_alpha"
15 NEW_LSTDIR="${OUTDIR}/lst"
16
17
18 VECTOR_FILE="/local_disk/pegasus/laboinfo/mquillot/knowledge_distillation/exp/kd_xvectors/${kfold}/${t}/teacher/masseffect_pvectors.txt" # To specify if there's only one
19 VECTOR_FILES_ONE=true # Specify there's only one file
20 ORIGINAL_VECTOR_FILE="/local_disk/pegasus/laboinfo/mquillot/knowledge_distillation/data/masseffect_xvectors.txt"
21
22 MIN_KFOLD=${kfold}
23 MAX_KFOLD=${kfold}
24
25 KMIN=2
26 KMAX=100
27
config/config_without_kfold_iv.sh
File was created 1 OUTDIR="exp/kmeans_euclidian_skyrim/ivectors"
2 DATADIR="data"
3 NEW_LSTDIR="${OUTDIR}/lst"
4
5 LST_FILE="/local_disk/pegasus/laboinfo/mquillot/data/skyrim/skyrim_ivectors.txt"
6 VECTOR_FILE="data/ivectors.txt" # To specify if there's only one
7 VECTOR_FILES_ONE=true # Specify there's only one file
8
9 WITHOUT_KFOLD=""
10 KMIN=2
11 KMAX=100
12
13 METAS_CHARACTER=""
File was created 1 OUTDIR="exp/kmeans_euclidian/xv"
2 DATADIR="data"
3 NEW_LSTDIR="${OUTDIR}/lst"
4
5 VECTOR_FILE="data/xvectors.txt" # To specify if there's only one
6 VECTOR_FILES_ONE=true # Specify there's only one file
7
8 ORIGINAL_VECTOR_FILE="${VECTOR_FILE}"
9 KMIN=2
10 KMAX=100
11
config/ivector_config.sh
1 OUTDIR="exp/kmeans_euclidian/ivectors" File was deleted
2 DATADIR="data"
3 NEW_LSTDIR="${OUTDIR}/lst"
4
5 VECTOR_FILE="data/ivectors.txt" # To specify if there's only one
6 VECTOR_FILES_ONE=true # Specify there's only one file
7
8 KMIN=2
9 KMAX=100
10 1 OUTDIR="exp/kmeans_euclidian/ivectors"
config/pv_from_xv_config.sh
1 File was deleted
2 # Framework configuration
3 OUTDIR="exp/kmeans_euclidian/pv_from_xv"
4 DATADIR="data"
5 NEW_LSTDIR="${OUTDIR}/lst"
6
7 VECTOR_FILES_BEGIN="${DATADIR}/pv_from_xv/me_pv_teacher"
8 VECTOR_FILES_END=".txt"
9 VECTOR_FILE="" # To specify if there's only one
10 VECTOR_FILES_ONE=false # Specify there's only one file
11
12 KMIN=2
13 KMAX=100
14 1
config/pvector_config.sh
1 File was deleted
2 OUTDIR="exp/kmeans_euclidian/teacher-pvector-1"
3 DATADIR="data"
4 NEW_LSTDIR="${OUTDIR}/lst"
5
6 VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher"
7 VECTOR_FILES_END=".txt"
8 VECTOR_FILE="" # To specify if there's only one
9 VECTOR_FILES_ONE=false # Specify there's only one file
10
11 KMIN=2
12 KMAX=100
13 1
config/pvector_layer1_config.sh
1 OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer1" File was deleted
2 DATADIR="data"
3 NEW_LSTDIR="${OUTDIR}/lst"
4
5 VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_1"
6 VECTOR_FILES_END=".txt"
7 VECTOR_FILE="" # To specify if there's only one
8 VECTOR_FILES_ONE=false # Specify there's only one file
9
10 KMIN=2
11 KMAX=100
12 1 OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer1"
config/pvector_layer2_config.sh
1 OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer2" File was deleted
2 DATADIR="data"
3 NEW_LSTDIR="${OUTDIR}/lst"
4
5 VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_2"
6 VECTOR_FILES_END=".txt"
7 VECTOR_FILE="" # To specify if there's only one
8 VECTOR_FILES_ONE=false # Specify there's only one file
9
10 KMIN=2
11 KMAX=100
12 1 OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer2"
config/pvector_layer3_config.sh
1 OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer3" File was deleted
2 DATADIR="data"
3 NEW_LSTDIR="${OUTDIR}/lst"
4
5 VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_3"
6 VECTOR_FILES_END=".txt"
7 VECTOR_FILE="" # To specify if there's only one
8 VECTOR_FILES_ONE=false # Specify there's only one file
9
10 KMIN=2
11 KMAX=100
12 1 OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer3"
config/pvector_layer4_config.sh
1 OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer4" File was deleted
2 DATADIR="data"
3 NEW_LSTDIR="${OUTDIR}/lst"
4
5 VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/embedding_activation_4"
6 VECTOR_FILES_END=".txt"
7 VECTOR_FILE="" # To specify if there's only one
8 VECTOR_FILES_ONE=false # Specify there's only one file
9
10 KMIN=2
11 KMAX=100
12 1 OUTDIR="exp/kmeans_euclidian/teacher-pvector-layer4"
config/xvector_config.sh
1 OUTDIR="exp/kmeans_euclidian/xvectors" File was deleted
2 DATADIR="data"
3 NEW_LSTDIR="${OUTDIR}/lst"
4
5 VECTOR_FILE="data/xvectors.txt" # To specify if there's only one
6 VECTOR_FILES_ONE=true # Specify there's only one file
7
8 KMIN=2
9 KMAX=100
10 1 OUTDIR="exp/kmeans_euclidian/xvectors"
extract-labels-pv-from-xv.sh
1 File was deleted
2
3 # Number of set
4 k=4
5
6
7 # Vector features file
8 DATADIR="data"
9
10 VECTOR_FILE_MASSEFFECT="${DATADIR}/xvectors.txt"
11
12 for kmean in 12 41 45 50 6 69 72 88
13 do
14 echo "KMEAN: ${kmean}"
15 # Dirs
16 EXP_DIR="exp/kmeans_euclidian/pv_from_xv/${k}/${kmean}"
17 CLUSTERING="${EXP_DIR}/clustering_${kmean}.pkl"
18
19
20 # Output dirs
21 OUTFILE_MASSEFFECT="data/pv_from_xv/saved_clustered/masseffect_clustered_${k}_${kmean}.txt"
22 echo "Extracting"
23 python3 bin/extract_kmeans.py "${CLUSTERING}" \
24 "${VECTOR_FILE_MASSEFFECT}" \
25 --outfile "$OUTFILE_MASSEFFECT"
26 echo "End extracting"
27 done
28 1
extract-labels.sh
1 File was deleted
2
3 # Number of set
4 k=4
5 kmean=88
6
7
8 # Vector features file
9 VECTOR_FILE_MASSEFFECT="data/xvectors.txt"
10
11
12 # Dirs
13 EXP_DIR="exp/kmeans_euclidian/xvectors/${k}/${kmean}"
14 CLUSTERING="${EXP_DIR}/clustering_${kmean}.pkl"
15
16
17 # Output dirs
18 OUTFILE_MASSEFFECT="data/xvectors/saved_clustered/masseffect_clustered_xvectors_${k}_${kmean}.txt"
19
20 python3 bin/extract_kmeans.py "${CLUSTERING}" \
21 "${VECTOR_FILE_MASSEFFECT}" \
22 --outfile "$OUTFILE_MASSEFFECT"
23 1
rm-unused-files.sh
1 File was deleted
2 if [ $# -eq 1 ]
3 then
4 EXP_DIR="$1"
5 else
6 echo "Need to have one and only one argument. This argument is the exp directory."
7 exit 1
8 fi
9
10 for kfold in {1..4}
11 do
12 for k in {1..100}
13 do
14 rm ${EXP_DIR}/$kfold/$k/clustered_$k.txt
15 done
16 done
17 1
1 # 1 #
2 # This script aims to compute clustering 2 # This script aims to compute clustering
3 # 3 #
4 4
5 5
6 # -- CONFIGURATION 6 # -- CONFIGURATION
7 # THIS SCRIPT NEEDS THESE VARIABLES 7 # THIS SCRIPT NEEDS THESE VARIABLES
8 # Vector file 8 # Vector file
9 #VECTOR_FILE="" 9 #VECTOR_FILE=""
10 # Train list 10 # Train list
11 #TRAIN_LST=="" 11 #TRAIN_LST==""
12 # Val list 12 # Val list
13 #VAL_LST="" 13 #VAL_LST=""
14 # Exp directory 14 # Exp directory
15 #EXP_DIR="" 15 #EXP_DIR=""
16 # Metas file with type values 16 # Metas file with type values
17 #METAS_TYPE="" 17 #METAS_TYPE=""
18 # Metas file with character values 18 # Metas file with character values
19 #METAS_CHARACTER="" 19 #METAS_CHARACTER=""
20 20
21 21
22 #echo "VECTOR FILE: $VECTOR_FILE" 22 #echo "VECTOR FILE: $VECTOR_FILE"
23 #echo "TRAIN LIST: $TRAIN_LST" 23 #echo "TRAIN LIST: $TRAIN_LST"
24 #echo "VAL LIST: $VAL_LST" 24 #echo "VAL LIST: $VAL_LST"
25 #echo "EXP DIR: $EXP_DIR" 25 #echo "EXP DIR: $EXP_DIR"
26 #echo "METAS TYPE: $METAS_TYPE" 26 #echo "METAS TYPE: $METAS_TYPE"
27 #echo "METAS_CHARACTER: $METAS_CHARACTER" 27 #echo "METAS_CHARACTER: $METAS_CHARACTER"
28 28
29 29
30 30
31 # -- TRAIN KMEANS 31 # -- TRAIN KMEANS
32 echo "Clustering - ${kfold}" 32 echo "Clustering - ${kfold}"sss
33 python3 bin/cluster_kmeans.py "${VECTOR_FILE}" \ 33 python3 bin/cluster_kmeans.py "${VECTOR_FILE}" \
34 "${TRAIN_LST}" \ 34 "${TRAIN_LST}" \
35 "${EXP_DIR}" --kmin ${KMIN} --kmax ${KMAX} 35 "${EXP_DIR}" --kmin ${KMIN} --kmax ${KMAX}
36 36
37 37
38 38
39 for k in $(seq ${KMIN} 1 ${KMAX}) 39 for k in $(seq ${KMIN} 1 ${KMAX})
40 do 40 do
41 SUB_EXP_DIR="${EXP_DIR}/${k}" 41 SUB_EXP_DIR="${EXP_DIR}/${k}"
42 42
43 # -- EXTRACT KMEANS VALUES 43 # -- EXTRACT KMEANS VALUES
44 echo "Kmeans Measuring and extraction - ${k}" 44 echo "Kmeans Measuring and extraction - ${k}"
45 python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \ 45 python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \
46 "${VECTOR_FILE}" \ 46 "${VECTOR_FILE}" \
47 --outfile "${SUB_EXP_DIR}/clustered_${k}.txt" 47 --outfile "${SUB_EXP_DIR}/clustered_${k}.txt"
48 # -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR 48 # -- MEASURES AND PLOT WITH RESPECT TO CHARACTER VAR
49 # Measures 49 # Measures
50 python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ 50 python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
51 "${METAS_CHARACTER}" \ 51 "${METAS_CHARACTER}" \
52 "${TRAIN_LST}" \ 52 "${TRAIN_LST}" \
53 "${VAL_LST}" \ 53 "${VAL_LST}" \
54 --outfile "${SUB_EXP_DIR}/measures.json" 54 --outfile "${SUB_EXP_DIR}/measures.json"
55 55
56 # Plot count matrix for train 56 # Plot count matrix for train
57 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ 57 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
58 ${VECTOR_FILE} \ 58 ${VECTOR_FILE} \
59 ${TRAIN_LST} \ 59 ${TRAIN_LST} \
60 --outfile "${SUB_EXP_DIR}/train_count_matrix.pdf" 60 --outfile "${SUB_EXP_DIR}/train_count_matrix.pdf"
61 61
62 # Plot count matrix for val 62 # Plot count matrix for val
63 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ 63 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
64 ${VECTOR_FILE} \ 64 ${VECTOR_FILE} \
65 ${VAL_LST} \ 65 ${VAL_LST} \
66 --outfile "${SUB_EXP_DIR}/val_count_matrix.pdf" 66 --outfile "${SUB_EXP_DIR}/val_count_matrix.pdf"
67 67
68 # -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR 68 # -- MEASURES AND PLOT WITH RESPECT TO TYPE VAR
69 # Measures 69 # Measures
70 python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ 70 python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
71 "${METAS_TYPE}" \ 71 "${METAS_TYPE}" \
72 "${TRAIN_LST}" \ 72 "${TRAIN_LST}" \
73 "${VAL_LST}" \ 73 "${VAL_LST}" \
74 --outfile "${SUB_EXP_DIR}/measures_type.json" 74 --outfile "${SUB_EXP_DIR}/measures_type.json"
75 75
76 # This script plots the count matrix of the train set 76 # This script plots the count matrix of the train set
77 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ 77 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
78 "${METAS_TYPE}" \ 78 "${METAS_TYPE}" \
79 "${TRAIN_LST}" \ 79 "${TRAIN_LST}" \
80 --outfile "${SUB_EXP_DIR}/train_count_matrix_type.pdf" 80 --outfile "${SUB_EXP_DIR}/train_count_matrix_type.pdf"
81 81
82 # This script plots the count matrix of the validation set 82 # This script plots the count matrix of the validation set
83 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ 83 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
84 "${METAS_TYPE}" \ 84 "${METAS_TYPE}" \
85 "${VAL_LST}" \ 85 "${VAL_LST}" \
86 --outfile "${SUB_EXP_DIR}/val_count_matrix_type.pdf" 86 --outfile "${SUB_EXP_DIR}/val_count_matrix_type.pdf"
87 87
88 88
89 # -- MEASURES AND PLOT WITH RESPECT TO LANG VAR 89 # -- MEASURES AND PLOT WITH RESPECT TO LANG VAR
90 # Measures 90 # Measures
91 python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ 91 python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
92 "${METAS_LANG}" \ 92 "${METAS_LANG}" \
93 "${TRAIN_LST}" \ 93 "${TRAIN_LST}" \
94 "${VAL_LST}" \ 94 "${VAL_LST}" \
95 --outfile "${SUB_EXP_DIR}/measures_lang.json" 95 --outfile "${SUB_EXP_DIR}/measures_lang.json"
96 96
97 # This script plots the count matrix of the train set 97 # This script plots the count matrix of the train set
98 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ 98 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
99 "${METAS_LANG}" \ 99 "${METAS_LANG}" \
100 "${TRAIN_LST}" \ 100 "${TRAIN_LST}" \
101 --outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf" 101 --outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf"
102 102
103 # This script plots the count matrix of the validation set 103 # This script plots the count matrix of the validation set
104 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ 104 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
105 "${METAS_LANG}" \ 105 "${METAS_LANG}" \
106 "${VAL_LST}" \ 106 "${VAL_LST}" \
107 --outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf" 107 --outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf"
108 108
109 done 109 done
110 110
111 111
1 # Pour le moment, le run ne fait qu'executer 1 # Pour le moment, le run ne fait qu'executer
2 # quelques petites commandes que l'on souhaite 2 # quelques petites commandes que l'on souhaite
3 # tester. 3 # tester.
4 4
5 OUTDIR="exp/kmeans_euclidian/teacher-pvector-1" 5 set -e
6
7 OUTDIR="exp/kmeans_euclidian/ivectors"
6 EXP_DIR=${OUTDIR} 8 EXP_DIR=${OUTDIR}
7 DATADIR="data" 9 DATADIR="data"
8 NEW_LSTDIR="${OUTDIR}/lst" 10 NEW_LSTDIR="${OUTDIR}/lst"
9 11
10 kmin=2 12 kmin=2
11 kmax=100 13 kmax=100
12 14
13 if [ ! -d "$OUTDIR" ]; 15 if [ ! -d "$OUTDIR" ];
14 then 16 then
15 mkdir -p $OUTDIR 17 mkdir -p $OUTDIR
16 fi 18 fi
17 19
18 if [ ! -d "$NEW_LSTDIR" ]; 20 if [ ! -d "$NEW_LSTDIR" ];
19 then 21 then
20 mkdir -p $NEW_LSTDIR 22 mkdir -p $NEW_LSTDIR
21 fi 23 fi
22 24
23 for kfold in {1..4} 25 for kfold in {1..4}
24 do 26 do
25 pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt" 27 #pvector_file="${DATADIR}/pvectors_1rst/pvectors_teacher_${kfold}.txt"
26 VECTOR_FILE=$pvector_file 28 VECTOR_FILE="${DATADIR}/ivectors.txt"
27 lst_dir="${DATADIR}/pvectors_1rst/lst" 29 lst_dir="${DATADIR}/pvectors_1rst/lst"
28 output_kfold="${OUTDIR}/${kfold}" 30 output_kfold="${OUTDIR}/${kfold}"
29 31
30 #python3 "bin/replace_label.py" \ 32 #python3 "bin/replace_label.py" \
31 # "${DATADIR}/masseffect.lst" \ 33 # "${DATADIR}/masseffect.lst" \
32 # "${DATADIR}/character_information.csv" \ 34 # "${DATADIR}/character_information.csv" \
33 # --field "type" --lst "data/pvectors_1rst/lst/train_${kfold}.lst" \ 35 # --field "type" --lst "data/pvectors_1rst/lst/train_${kfold}.lst" \
34 # --outfile "${NEW_LSTDIR}/train_${kfold}_type.lst" 36 # --outfile "${NEW_LSTDIR}/train_${kfold}_type.lst"
35 37
36 #python3 "bin/replace_label.py" \ 38 #python3 "bin/replace_label.py" \
37 # "${DATADIR}/masseffect.lst" \ 39 # "${DATADIR}/masseffect.lst" \
38 # "${DATADIR}/character_information.csv" \ 40 # "${DATADIR}/character_information.csv" \
39 # --field "type" --lst "data/pvectors_1rst/lst/val_${kfold}.lst" \ 41 # --field "type" --lst "data/pvectors_1rst/lst/val_${kfold}.lst" \
40 # --outfile "${NEW_LSTDIR}/val_${kfold}_type.lst" 42 # --outfile "${NEW_LSTDIR}/val_${kfold}_type.lst"
41 43
42 #cat "${NEW_LSTDIR}/train_${kfold}_type.lst" "${NEW_LSTDIR}/val_${kfold}_type.lst" > "${NEW_LSTDIR}/metas_${kfold}_type.lst" 44 #cat "${NEW_LSTDIR}/train_${kfold}_type.lst" "${NEW_LSTDIR}/val_${kfold}_type.lst" > "${NEW_LSTDIR}/metas_${kfold}_type.lst"
43 TRAIN_LST=${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst 45 TRAIN_LST=${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst
44 VAL_LST=${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst 46 VAL_LST=${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst
45 TRAIN_LANG_LST=${NEW_LSTDIR}/train_${kfold}_lang.lst 47 TRAIN_LANG_LST=${NEW_LSTDIR}/train_${kfold}_lang.lst
46 VAL_LANG_LST=${NEW_LSTDIR}/val_${kfold}_lang.lst 48 VAL_LANG_LST=${NEW_LSTDIR}/val_${kfold}_lang.lst
47 METAS_LANG=${NEW_LSTDIR}/metas_${kfold}_lang.lst 49 METAS_LANG=${NEW_LSTDIR}/metas_${kfold}_lang.lst
48 50
49 # EXTRACT LANGUAGE INFORMATION 51 # EXTRACT LANGUAGE INFORMATION
50 awk '$2=$1' FS=, OFS=, ${TRAIN_LST} > ${TRAIN_LANG_LST} 52 awk '$2=$1' FS=, OFS=, ${TRAIN_LST} > ${TRAIN_LANG_LST}
51 echo "VAL EXTRACT LANGUAGE INFO DONE" 53 echo "VAL EXTRACT LANGUAGE INFO DONE"
52 awk '$2=$1' FS=, OFS=, ${VAL_LST} > ${VAL_LANG_LST} 54 awk '$2=$1' FS=, OFS=, ${VAL_LST} > ${VAL_LANG_LST}
53 echo "TRAIN EXTRACT LANGUAGE INFO DONE" 55 echo "TRAIN EXTRACT LANGUAGE INFO DONE"
54 cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}" 56 cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}"
55 echo "GLOBAL EXTRACT LANGUAGE INFO DONE" 57 echo "GLOBAL EXTRACT LANGUAGE INFO DONE"
56 58
57 59
58 echo "Clustering - ${kfold}" 60 echo "Clustering - ${kfold}"
59 61
60 for k in $(seq ${kmin} 1 ${kmax}) 62 for k in $(seq ${kmin} 1 ${kmax})
61 do 63 do
62 echo "Kmeans Measuring and ploting - ${k}" 64 echo "Kmeans Measuring and ploting - ${k}"
63 65
64 SUB_EXP_DIR="${EXP_DIR}/${kfold}/${k}" 66 SUB_EXP_DIR="${EXP_DIR}/${kfold}/${k}"
65 67
66 # -- EXTRACT CLUSTERING LABELS 68 # -- EXTRACT CLUSTERING LABELS
67 python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \ 69 python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \
68 "${VECTOR_FILE}" \ 70 "${VECTOR_FILE}" \
69 --outfile "${SUB_EXP_DIR}/clustered_${k}.txt" 71 --outfile "${SUB_EXP_DIR}/clustered_${k}.txt"
70 72
71 # -- MEASURES AND PLOT WITH RESPECT TO LANG VAR 73 # -- MEASURES AND PLOT
72 # Measures 74 source steps/measure_clustering_char.sh
73 python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \ 75 source steps/measure_clustering_type.sh
74 "${METAS_LANG}" \ 76 source steps/measure_clustering_lang.sh
75 "${TRAIN_LST}" \
76 "${VAL_LST}" \
77 --outfile "${SUB_EXP_DIR}/measures_lang.json"
78 77
79 # This script plot the count matrix of the train set 78 rm ${SUB_EXP_DIR}/clustered_${k}.txt
80 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
81 "${METAS_LANG}" \
82 "${TRAIN_LST}" \
83 --outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf"
84
85 # This script plot the count matrix of the validation set
86 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
87 "${METAS_LANG}" \
88 "${VAL_LST}" \
89 --outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf"
90
91 rm ${SUB_EXP_DIR}/clustered_${k}.txt
92 #python3 bin/measure_clustering.py "${output_kfold}/${k}/clustered_${k}.txt" \
93 # "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" \
94 # "${lst_dir}/val_${kfold}.lst" \
95 # --outfile "${output_kfold}/${k}/measures_type.json"
96
97 # This script plot the count matrix of the train set
98 #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
99 # ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/train_${kfold}.lst \
100 # --outfile ${output_kfold}/${k}/train_count_matrix_type.pdf
101
102 # This script plot the count matrix of the validation set
103 #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
104 # ${NEW_LSTDIR}/metas_${kfold}_type.lst ${lst_dir}/val_${kfold}.lst \
105 # --outfile ${output_kfold}/${k}/val_count_matrix_type.pdf
106
107 # This script plot the count matrix of the train set
108 #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
109 # ${pvector_file} ${lst_dir}/train_${kfold}.lst \
110 # --outfile ${output_kfold}/${k}/train_count_matrix.pdf
111
112 # This script plot the count matrix of the validation set
113 #python3 bin/plot-count-matrix.py ${output_kfold}/${k}/clustered_${k}.txt \
114 # ${pvector_file} ${lst_dir}/val_${kfold}.lst \
115 # --outfile ${output_kfold}/${k}/val_count_matrix.pdf
116 done 79 done
117 done 80 done
File was created 1 python bin/cluster_kmeans.py ../data/skyrim/skyrim_ivectors.txt ../data/skyrim/skyrim.lst exp/kmeans_euclidian_skyrim/ivectors/ --kmin 1 --kmax 100
2
1 1
2 #OUTDIR="exp/test/pvector-2" 2 #OUTDIR="exp/test/pvector-2"
3 #DATADIR="data" 3 #DATADIR="data"
4 #NEW_LSTDIR="${OUTDIR}/lst" 4 #NEW_LSTDIR="${OUTDIR}/lst"
5 5
6 #VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher" 6 #VECTOR_FILES_BEGIN="${DATADIR}/pvectors_1rst/pvectors_teacher"
7 #VECTOR_FILES_END=".txt" 7 #VECTOR_FILES_END=".txt"
8 #VECTOR_FILE="" # To specify if there's only one 8 #VECTOR_FILE="" # To specify if there's only one
9 #VECTOR_FILES_ONE=false # Specify there's only one file 9 #VECTOR_FILES_ONE=false # Specify there's only one file
10 10
11 #KMIN=2 11 #KMIN=2
12 #KMAX=100 12 #KMAX=100
13 13
14 # -- LOAD CONFIG FILE 14 # -- LOAD CONFIG FILE
15 CONFIG_FILE="config.sh" 15 CONFIG_FILE="config.sh"
16 16
17 if [ $# -eq 1 ] 17 if [ $# -eq 1 ]
18 then 18 then
19 CONFIG_FILE="$1" 19 CONFIG_FILE="$1"
20 else 20 else
21 echo "Need to have one and only one argument" 21 echo "Need to have one and only one argument"
22 exit -1 22 exit -1
23 fi 23 fi
24 24
25 source $CONFIG_FILE 25 source $CONFIG_FILE
26 26
27 # -- DEFAULTS VALUES CONFIGURATION 27 # -- DEFAULTS VALUES CONFIGURATION
28 if [ -z "$VECTOR_FILES_ONE" ] 28 if [ -z "$VECTOR_FILES_ONE" ]
29 then 29 then
30 VECTOR_FILES_ONE=false 30 VECTOR_FILES_ONE=false
31 fi 31 fi
32 32
33 33
34 if [ -z "$METAS_CHARACTER" ]
35 then
36 METAS_CHARACTER="${DATADIR}/masseffect.lst"
37 fi
34 38
39
40 if [ -z "$CHAR_INFO" ]
41 then
42 CHAR_INFO="${DATADIR}/character_information.csv"
43 fi
44
35 # -- MAKE DIRECTORIES 45 # -- MAKE DIRECTORIES
36 if [ ! -d "$OUTDIR" ]; 46 if [ ! -d "$OUTDIR" ];
37 then 47 then
38 mkdir -p $OUTDIR 48 mkdir -p $OUTDIR
39 fi 49 fi
40 50
41 if [ ! -d "${NEW_LSTDIR}" ]; 51 if [ ! -d "${NEW_LSTDIR}" ];
42 then 52 then
43 mkdir -p ${NEW_LSTDIR} 53 mkdir -p ${NEW_LSTDIR}
44 fi 54 fi
45 55
46 56
47 # -- KFOLD MIN and MAX 57 # -- KFOLD MIN and MAX
48 if [ -z "$MIN_KFOLD" ] 58 if [ -z "$MIN_KFOLD" ]
49 then 59 then
50 MIN_KFOLD=1 60 MIN_KFOLD=1
51 fi 61 fi
52 62
53 if [ -z "$MAX_KFOLD" ] 63 if [ -z "$MAX_KFOLD" ]
54 then 64 then
55 MAX_KFOLD=4 65 MAX_KFOLD=4
56 fi 66 fi
57 67
58 # -- BEGIN BY KFOLD 68 # -- BEGIN BY KFOLD
59 for kfold in $(seq ${MIN_KFOLD} ${MAX_KFOLD}) 69 for kfold in $(seq ${MIN_KFOLD} ${MAX_KFOLD})
60 do 70 do
61 # Some usefull variable 71 # Some usefull variable
62 CHAR_INFO="${DATADIR}/character_information.csv"
63 TRAIN_TYPE_LST="${NEW_LSTDIR}/train_${kfold}_type.lst" 72 TRAIN_TYPE_LST="${NEW_LSTDIR}/train_${kfold}_type.lst"
64 VAL_TYPE_LST="${NEW_LSTDIR}/val_${kfold}_type.lst" 73 VAL_TYPE_LST="${NEW_LSTDIR}/val_${kfold}_type.lst"
65 TRAIN_LANG_LST="${NEW_LSTDIR}/train_${kfold}_lang.lst" 74 TRAIN_LANG_LST="${NEW_LSTDIR}/train_${kfold}_lang.lst"
66 VAL_LANG_LST="${NEW_LSTDIR}/val_${kfold}_lang.lst" 75 VAL_LANG_LST="${NEW_LSTDIR}/val_${kfold}_lang.lst"
67 76
68 # Configuration for the run clustering file 77 # Configuration for the run clustering file
69 if [ ${VECTOR_FILES_ONE} == false ] 78 if [ ${VECTOR_FILES_ONE} == false ]
70 then 79 then
71 VECTOR_FILE="${VECTOR_FILES_BEGIN}_${kfold}${VECTOR_FILES_END}" 80 VECTOR_FILE="${VECTOR_FILES_BEGIN}_${kfold}${VECTOR_FILES_END}"
72 fi 81 fi
73 82
74 TRAIN_LST="${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst" 83 TRAIN_LST="${MOTHER_LST_DIR}/lst/train_${kfold}.lst"
75 VAL_LST="${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst" 84 VAL_LST="${MOTHER_LST_DIR}/lst/val_${kfold}.lst"
76 EXP_DIR="${OUTDIR}/${kfold}" 85 EXP_DIR="${OUTDIR}/${kfold}"
77 METAS_TYPE="${NEW_LSTDIR}/metas_${kfold}_type.lst" 86 METAS_TYPE="${NEW_LSTDIR}/metas_${kfold}_type.lst"
78 METAS_CHARACTER="${DATADIR}/masseffect.lst"
79 METAS_LANG="${NEW_LSTDIR}/metas_${kfold}_lang.lst" 87 METAS_LANG="${NEW_LSTDIR}/metas_${kfold}_lang.lst"
80 88
81 89
82 if [ ! -d "${EXP_DIR}" ]; 90 if [ ! -d "${EXP_DIR}" ];
83 then 91 then
84 mkdir -p ${EXP_DIR} 92 mkdir -p ${EXP_DIR}
85 fi 93 fi
86 94
87 95
88 # EXTRACT TYPE INFORMATION 96 # EXTRACT TYPE INFORMATION
89 echo "Extracting character information" 97 echo "Extracting character information"
90 echo "Replace in train" 98 echo "Replace in train"
91 python3 "bin/replace_label.py" \ 99 python3 "bin/replace_label.py" \
92 "${METAS_CHARACTER}" \ 100 "${METAS_CHARACTER}" \
93 "${CHAR_INFO}" \ 101 "${CHAR_INFO}" \
94 --field "type" \ 102 --field "type" \
95 --lst "${TRAIN_LST}" \ 103 --lst "${TRAIN_LST}" \
96 --outfile "${TRAIN_TYPE_LST}" 104 --outfile "${TRAIN_TYPE_LST}"
97 105
98 echo "Replace in val" 106 echo "Replace in val"
99 python3 "bin/replace_label.py" \ 107 python3 "bin/replace_label.py" \
100 "${METAS_CHARACTER}" \ 108 "${METAS_CHARACTER}" \
101 "${CHAR_INFO}" \ 109 "${CHAR_INFO}" \
102 --field "type" \ 110 --field "type" \
103 --lst "${VAL_LST}" \ 111 --lst "${VAL_LST}" \
104 --outfile "${VAL_TYPE_LST}" 112 --outfile "${VAL_TYPE_LST}"
105 113
106 echo "Merge them" 114 echo "Merge them"
107 cat "${TRAIN_TYPE_LST}" "${VAL_TYPE_LST}" > "${METAS_TYPE}" 115 cat "${TRAIN_TYPE_LST}" "${VAL_TYPE_LST}" > "${METAS_TYPE}"
108 116
109 # EXTRACT LANGUAGE INFORMATION 117 # EXTRACT LANGUAGE INFORMATION
110 echo "Language info for train" 118 echo "Language info for train"
111 awk '$2=$1' FS=, OFS=, ${TRAIN_LST} > ${TRAIN_LANG_LST} 119 awk '$2=$1' FS=, OFS=, ${TRAIN_LST} > ${TRAIN_LANG_LST}
112 echo "Language info for val" 120 echo "Language info for val"
113 awk '$2=$1' FS=, OFS=, ${VAL_LST} > ${VAL_LANG_LST} 121 awk '$2=$1' FS=, OFS=, ${VAL_LST} > ${VAL_LANG_LST}
114 122
115 echo "Merge them" 123 echo "Merge them"
116 cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}" 124 cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}"
117 125
118 echo "Then Run Clustering" 126 echo "Then Run Clustering"
119 source "run-clustering.sh" 127 source "run-clustering.sh"
120 done 128 done
121 129
122 # Regroup measures with respect to character classes 130 # Regroup measures with respect to character classes
123 echo "Regrouping measures with respect to character classes" 131 echo "Regrouping measures with respect to character classes"
124 python3 "bin/regroup-measures.py" ${OUTDIR} 132 python3 "bin/regroup-measures.py" ${OUTDIR}
125 133
126 # Regroup measures with respect to type classes 134 # Regroup measures with respect to type classes
127 echo "Regrouping measures with respect to type classes" 135 echo "Regrouping measures with respect to type classes"
128 python3 "bin/regroup-measures.py" ${OUTDIR} --suffix "_type" --measurefile "measures_type.json" 136 python3 "bin/regroup-measures.py" ${OUTDIR} --suffix "_type" --measurefile "measures_type.json"
129 137
File was created 1
2 for kfold in `seq 1 4`
3 do
4 echo "KFOLD: ${kfold}"
5 source run.sh
6 done
7
8
run_without_kfold.sh
File was created 1
2 for k in $(seq ${KMIN} 1 ${KMAX})
3 do
4 SUB_EXP_DIR="${EXP_DIR}/${k}"
5
6 # -- EXTRACT KMEANS VALUES (reads clustering_${k}.pkl from the per-k directory)
7 echo "Kmeans Measuring and extraction - ${k}"
8 python3 bin/extract_kmeans.py "${SUB_EXP_DIR}/clustering_${k}.pkl" \
9 "${VECTOR_FILE}" \
10 --outfile "${SUB_EXP_DIR}/clustered_${k}.txt"
11 # -- MEASURE THE CLUSTERING (writes measures.json into the same per-k directory)
12 python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
13 "${METAS_CHARACTER}" \
14 "${TRAIN_LST}" \
15 "${VAL_LST}" \
16 --outfile "${SUB_EXP_DIR}/measures.json"
17 done
steps/extract_cluster_file.sh
File was created 1
2 for kfold in `seq 1 4`
3 do
4 source $1
5 vector_file=${VECTOR_FILE}
6 echo "kfold: $kfold"
7 for kmean in `seq 2 100`
8 do
9 echo "kmean: $kmean"
10 exp_dir="${OUTDIR}/${kfold}/${kmean}"
11 clustering="${exp_dir}/clustering_${kmean}.pkl"
12 save_loc="${exp_dir}"
13 saved_txt="${save_loc}/masseffect_clustered.txt"
14 saved_lst="${save_loc}/masseffect_clustered.lst"
15
16 python3 bin/extract_kmeans.py "${clustering}" \
17 "${vector_file}" \
18 --outfile "${saved_txt}"
19
20 cat ${saved_txt} | cut -d" " -f1 > ${saved_lst}
21
22 python3 bin/replace-features.py "${ORIGINAL_VECTOR_FILE}" "${saved_txt}"
23 done
24 done
25
steps/extract_cluster_file_skyrim.sh
File was created 1
2 source $1
3 vector_file=${VECTOR_FILE}
4 echo "kfold: $kfold"
5 for kmean in `seq 2 100`
6 do
7 echo "kmean: $kmean"
8 exp_dir="${OUTDIR}/${kmean}"
9 clustering="${exp_dir}/clustering_${kmean}.pkl"
10 save_loc="${exp_dir}"
11 saved_txt="${save_loc}/masseffect_clustered.txt"
12 saved_lst="${save_loc}/masseffect_clustered.lst"
13
14 python3 bin/extract_kmeans.py "${clustering}" \
15 "${vector_file}" \
16 --outfile "${saved_txt}"
17
18 cat ${saved_txt} | cut -d" " -f1 > ${saved_lst}
19
20 python3 bin/replace-features.py "${ORIGINAL_VECTOR_FILE}" "${saved_txt}"
21 done
22
23
steps/extract_language_lst.sh
File was created 1 DATADIR="data"
2 OUTDIR="exp/kmeans_euclidian/ivectors"
3 NEW_LSTDIR="${OUTDIR}/lst"
4
5 TRAIN_LST=${DATADIR}/pvectors_1rst/lst/train_${kfold}.lst
6 VAL_LST=${DATADIR}/pvectors_1rst/lst/val_${kfold}.lst
7 TRAIN_LANG_LST=${NEW_LSTDIR}/train_${kfold}_lang.lst
8 VAL_LANG_LST=${NEW_LSTDIR}/val_${kfold}_lang.lst
9 METAS_LANG=${NEW_LSTDIR}/metas_${kfold}_lang.lst
10
11 # Overwrite the 2nd comma-separated field of each entry with the 1st (presumably the language tag -- confirm against the .lst format)
12 awk '$2=$1' FS=, OFS=, "${TRAIN_LST}" > "${TRAIN_LANG_LST}"
13 echo "TRAIN EXTRACT LANGUAGE INFO DONE"
14 awk '$2=$1' FS=, OFS=, "${VAL_LST}" > "${VAL_LANG_LST}"
15 echo "VAL EXTRACT LANGUAGE INFO DONE"
16 cat "${TRAIN_LANG_LST}" "${VAL_LANG_LST}" > "${METAS_LANG}"
17 echo "GLOBAL EXTRACT LANGUAGE INFO DONE"
steps/measure_clustering_char.sh
File was created 1
2 python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
3 "${lst_dir}/trainval_${kfold}.lst" "${lst_dir}/train_${kfold}.lst" \
4 "${lst_dir}/val_${kfold}.lst" \
5 --outfile "${SUB_EXP_DIR}/measures.json"
6 # NOTE(review): SUB_EXP_DIR, k, kfold and lst_dir are expected from the sourcing script;
7 # lst_dir is not set in the visible part of run-clustering.sh -- confirm it is defined.
8 # Plot the count matrix of the train set
9 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
10 "${lst_dir}/train_${kfold}.lst" \
11 "${lst_dir}/train_${kfold}.lst" \
12 --outfile "${SUB_EXP_DIR}/train_count_matrix.pdf"
13
14 # Plot the count matrix of the validation set
15 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
16 "${lst_dir}/val_${kfold}.lst" \
17 "${lst_dir}/val_${kfold}.lst" \
18 --outfile "${SUB_EXP_DIR}/val_count_matrix.pdf"
19
steps/measure_clustering_lang.sh
File was created 1
2 python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
3 "${METAS_LANG}" \
4 "${TRAIN_LST}" \
5 "${VAL_LST}" \
6 --outfile "${SUB_EXP_DIR}/measures_lang.json"
7
8 # This script plot the count matrix of the train set
9 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
10 "${METAS_LANG}" \
11 "${TRAIN_LST}" \
12 --outfile "${SUB_EXP_DIR}/train_count_matrix_lang.pdf"
13
14 # This script plot the count matrix of the validation set
15 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
16 "${METAS_LANG}" \
17 "${VAL_LST}" \
18 --outfile "${SUB_EXP_DIR}/val_count_matrix_lang.pdf"
steps/measure_clustering_type.sh
File was created 1 python3 bin/measure_clustering.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
2 "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" \
3 "${lst_dir}/val_${kfold}.lst" \
4 --outfile "${SUB_EXP_DIR}/measures_type.json"
5 # NOTE(review): SUB_EXP_DIR, k, kfold, NEW_LSTDIR and lst_dir are expected from the sourcing script -- confirm lst_dir is defined there.
6 # Plot the count matrix of the train set
7 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
8 "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/train_${kfold}.lst" \
9 --outfile "${SUB_EXP_DIR}/train_count_matrix_type.pdf"
10
11 # Plot the count matrix of the validation set
12 python3 bin/plot-count-matrix.py "${SUB_EXP_DIR}/clustered_${k}.txt" \
13 "${NEW_LSTDIR}/metas_${kfold}_type.lst" "${lst_dir}/val_${kfold}.lst" \
14 --outfile "${SUB_EXP_DIR}/val_count_matrix_type.pdf"
15
steps/save_clusters_file.sh
File was created 1
2 vector_file="data/xvectors.txt"
3
4 for kfold in `seq 1 4`
5 do
6 echo "kfold: $kfold"
7 for kmean in `seq 2 100`
8 do
9 echo "kmean: $kmean"
10 exp_dir="exp/kmeans_euclidian/xvectors/${kfold}/${kmean}"
11 clustering="${exp_dir}/clustering_${kmean}.pkl"
12 save_loc="data/xvectors/saved_clustered/"
13 saved_txt="${save_loc}/masseffect_clustered_xvectors_${kfold}_${kmean}.txt"
14 saved_lst="${save_loc}/masseffect_clustered_xvectors_${kfold}_${kmean}.lst"
15
16 python3 bin/extract_kmeans.py "${clustering}" \
17 "${vector_file}" \
18 --outfile "${saved_txt}"
19
20 cat ${saved_txt} | cut -d" " -f1 > ${saved_lst}
21 done
22 done
23
utils/extract-labels.sh
File was created 1
2
3 # Number of set
4 k=4
5 kmean=88
6
7
8 # Vector features file
9 VECTOR_FILE_MASSEFFECT="data/xvectors.txt"
10
11
12 # Dirs
13 EXP_DIR="exp/kmeans_euclidian/xvectors/${k}/${kmean}"
14 CLUSTERING="${EXP_DIR}/clustering_${kmean}.pkl"
15
16
17 # Output dirs
18 OUTFILE_MASSEFFECT="data/xvectors/saved_clustered/masseffect_clustered_xvectors_${k}_${kmean}.txt"
19
20 python3 bin/extract_kmeans.py "${CLUSTERING}" \
21 "${VECTOR_FILE_MASSEFFECT}" \
22 --outfile "$OUTFILE_MASSEFFECT"
23
utils/rm-unused-files.sh
File was created 1
2 if [ $# -eq 1 ]
3 then
4 EXP_DIR="$1"
5 else
6 echo "Need to have one and only one argument. This argument is the exp directory." >&2
7 exit 1
8 fi
9 # Remove clustered_<k>.txt from every <exp dir>/<kfold>/<k>; -f keeps already-missing files from raising errors
10 for kfold in {1..4}
11 do
12 for k in {1..100}
13 do
14 rm -f -- "${EXP_DIR}/${kfold}/${k}/clustered_${k}.txt"
15 done
16 done
17
utils/transform_exp_to_kd.sh
File was created 1
2 # -- DESCRIPTION --
3 #
4 # This script transforms data into a shape that is
5 # usable mainly by knowledge distillation scripts.
6 #
7 # First, it extracts the clustering labels,
8 # then replaces the features with the given ones,
9 # and finally generates a list file.
10 #
11 # The resulting pair of features file and list file is usable
12 # by the knowledge distillation system.
13 # --------------------
14
15
16 # -- CONFIGURATION --
17 # Configuration error
18 set -e
19
20 # KFOLD config
21 MIN_KFOLD=1
22 MAX_KFOLD=4
23
24 # KMEAN config
25 MIN_KMEAN=2
26 MAX_KMEAN=100
27
28 # Vector features file
29 DATADIR="data"
30 FEATURES_DIR="${DATADIR}/pv_from_xv"
31 FEATURES_PREFIX="me_pv_teacher"
32 FEATURES_SUFFIX=".txt"
33
34 EXP_DIR="exp/kmeans_euclidian/pv_from_xv"
35 VECTOR_FILE_MASSEFFECT="${DATADIR}/xvectors.txt"
36 OUTDIR="data/pv_from_xv/saved_clustered"
37
38 # -- CREATE DIRECTORIES
39 # OUTPUT DIRECTORY
40 if [ ! -d "${OUTDIR}" ]
41 then
42 mkdir -p ${OUTDIR}
43 fi
44
45
46 # -- FUNCTIONS --
47 # transform: build the clustered features file and list file for one (kfold, kmean) pair; reads the globals k and kmean set by the main loops
48 function transform() {
49 # Directory of this experiment (one per kfold and per number of clusters)
50 local SUB_EXP_DIR="${EXP_DIR}/${k}/${kmean}"
51
52 # Features file given to extract_kmeans.py (one per kfold)
53 local INITIAL_VECTOR_FILE="${FEATURES_DIR}/${FEATURES_PREFIX}_${k}${FEATURES_SUFFIX}"
54
55 # Report the (kfold, kmean) pair being processed
56 echo "[KFOLD, KMEAN]: [${k}, ${kmean}]"
57
58 # Saved clustering model for this number of clusters
59 local CLUSTERING="${SUB_EXP_DIR}/clustering_${kmean}.pkl"
60
61
62 # Output file: written by extract_kmeans.py, then passed to replace-features.py
63 local OUTFILE_MASSEFFECT="${OUTDIR}/masseffect_clustered_${k}_${kmean}.txt"
64
65 # Extracting clustering labels
66 echo "Extracting clustering labels"
67 python3 bin/extract_kmeans.py "${CLUSTERING}" \
68 "${INITIAL_VECTOR_FILE}" \
69 --outfile "${OUTFILE_MASSEFFECT}"
70
71 # Changing features (python3 like the other script calls; paths quoted)
72 echo "Changing features"
73 python3 bin/replace-features.py "${VECTOR_FILE_MASSEFFECT}" "${OUTFILE_MASSEFFECT}"
74
75 # Extracting list file: keep only the first space-separated field of each line
76 cut -d' ' -f1 "${OUTFILE_MASSEFFECT}" > "${OUTDIR}/masseffect_clustered_${k}_${kmean}.lst"
77 }
78
79
80 # -- MAIN LOOPS
81 for k in $(seq ${MIN_KFOLD} ${MAX_KFOLD})
82 do
83 for kmean in $(seq ${MIN_KMEAN} ${MAX_KMEAN})
84 do
85 transform
86 done
87 done
88