Commit ac78b07ea0ab18b7855f1b752e90fcac99440c98

Authored by Mathias Quillot
1 parent b8acebc1ed
Exists in master

All base bin files added

Showing 11 changed files with 751 additions and 0 deletions

bin/__pycache__/data.cpython-36.pyc
No preview for this file type
bin/cluster_kmeans.py
File was created 1 '''
2 This script aims at computing k-means for a given
3 data set.
4 '''
5
6 import argparse
7 import numpy as np
8 from sklearn.cluster import KMeans
9 from os import path
10
11 import pickle
12 from data import read_file, index_by_id
13
14 # -- ARGPARSE --
15 parser = argparse.ArgumentParser(description="Cluster with kmeans")
16 parser.add_argument("features", type=str, help="Features file")
17 parser.add_argument("list", type=str, help="List on which apply kmeans")
18 parser.add_argument("outdir", type=str, help="Output directory for k-means models")
19 parser.add_argument("--kmin", type=int, help="minimum k", default=2)
20 parser.add_argument("--kmax", type=int, help="maximum k", default=100)
21
22 args = vars(parser.parse_args())
23 FEATURES = args["features"]
24 LST = args["list"]
25 OUTDIR = args["outdir"]
26 KMIN = args["kmin"]
27 KMAX = args["kmax"]
28
29 # -- READ FILES --
30 features = read_file(FEATURES)
31 feat_ind = index_by_id(features)
32
33 lst = read_file(LST)
34
35 # -- TRANSFORM INTO NUMPY --
36 X = np.asarray([feat_ind[x[0][0]][x[0][3]][1] for x in lst])
37
38 Ks = range(KMIN, KMAX+1)
39 for k in Ks:
40 kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
41 pickle.dump(kmeans, open(path.join(OUTDIR, "clustering_" + str(k) + ".pkl"), "wb"))
42
43
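For reference, a model saved by this loop can later be reloaded and applied to new vectors with the standard pickle and scikit-learn calls. A minimal sketch, assuming a model trained with k=8 was written to exp/kmeans (both the directory and the value of k are hypothetical):

import pickle
import numpy as np

# Hypothetical path: adjust to the outdir and k actually used above.
with open("exp/kmeans/clustering_8.pkl", "rb") as f:
    kmeans = pickle.load(f)

# Dummy vectors with the same dimensionality as the training features.
new_vectors = np.random.rand(2, kmeans.cluster_centers_.shape[1])
print(kmeans.predict(new_vectors))  # cluster indices in [0, 8)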
bin/clustering_pvector.py
File was created 1 '''
2 The goal of this script is to apply a clustering to pvectors in order to find new classes assigned to each utterance or frame.
3 These new classes can be used to train new systems, for example replacing character classes by the classes computed from the clustering.
4 We hope this will generate interesting classes that help the system understand the structure of the voices.
5
6 TODO: Change it so that it takes a number (1, 2, 3, 4) and computes everything needed, such as the clustering. Train on the train set and then project the test set onto this clustering in order to know which cluster it belongs to.
7 '''
8
9 import os
10 import numpy as np
11 from sklearn.cluster import KMeans
12 import matplotlib.pyplot as plt
13 import argparse
14 import pandas as pd
15 import pickle
16
17
18 '''
19 Return the file content as two numpy arrays: (metas, vectors)
20 '''
21 def read_vecfile(filepath, toy_version=False):
22 vectors = ""
23 metas = ""
24 with open(filepath, "r") as f:
25 for i, line in enumerate(f):
26 if toy_version == True and i > 100:
27 break
28 spl_line = line.split(" ")
29
30 if(len(vectors) == 0):
31 vectors = np.empty((0, len(spl_line[1:])), np.float32)
32 metas = np.empty((0, len(spl_line[0].split(","))))
33
34 # Then we add the current line to the data
35 metas = np.append(
36 metas,
37 np.asarray([spl_line[0].split(",")]),
38 axis=0)
39
40 vectors = np.append(
41 vectors,
42 np.asarray([spl_line[1:]], dtype=np.float32),
43 axis=0)
44 return (metas, vectors)
45
46 '''
47 Return the list of metas from the list file
48 '''
49 def read_lstfile(filepath, toy_version=False):
50 metas = np.empty((0, 4))
51 with open(filepath, "r") as f:
52 for i, line in enumerate(f):
53 if toy_version == True and i > 100:
54 break
55 metas = np.append(
56 metas,
57 np.asarray([line.rstrip('\n').split(",")]),
58 axis=0)
59 return metas
60
61 '''
62 Save a vector file from metas and vector values
63 '''
64 def save_file(filepath, metas, values=None):
65 with open(filepath, "w") as f:
66 for i in range(len(metas)):
67 metas_str = ",".join(str(v) for v in metas[i])
68 if values is not None:
69 try:
70 infos_str = " ".join(str(v) for v in values[i])
71 except TypeError as te:
72 infos_str = str(values[i])
73 f.write(metas_str + " " + infos_str + "\n")
74 else:
75 f.write(metas_str + "\n")
76
77 '''
78 Take the data and index them.
79 '''
80 def index_data(metas, vectors):
81 data = {}
82 data["en-us"] = {}
83 data["fr-fr"] = {}
84 for i, vector in enumerate(vectors):
85 meta = metas[i]
86 data[meta[0]][meta[3]] = {}
87 data[meta[0]][meta[3]]["metas"] = meta
88 data[meta[0]][meta[3]]["vector"] = vector
89 return data
90
91
92
93 '''
94 Get a subset of the base data from a
95 list.
96 '''
97 def get_subdata(data, lst):
98 metas = ""
99 vectors = ""
100 for meta in lst:
101 vector = data[meta[0]][meta[3]]["vector"]
102 if(len(metas) == 0):
103 metas = np.empty((0, len(meta)))
104 vectors = np.empty((0, len(vector)), np.float64)
105 metas = np.append(
106 metas,
107 np.asarray([data[meta[0]][meta[3]]["metas"]]),
108 axis=0)
109 vectors = np.append(
110 vectors,
111 np.asarray([vector]),
112 axis=0)
113 return metas, vectors
114
115
116 '''
117 Apply clustering to the data of filename.
118 Use list files to determine the train, validation and test sets.
119 Save the file with the given suffix.
120 Check the existence of the files before calculating and saving:
121 if the two files already exist, it will not calculate them again.
122
123 However, if one of the files is missing, this function will
124 calculate everything again.
125
126 TODO: Add a variable to force the calculation of all the files
127 even if they exist.
128 '''
129 def apply_clustering(filename, dir_lst, dir_data, suffix_outfile):
130
131 # Apply clustering to the full (non-toy) data
132 metas, vectors = read_vecfile(os.path.join(dir_data, filename), toy_version=False)
133 data = index_data(metas, vectors)
134
135
136 # Get Train
137 train_lst = read_lstfile(os.path.join(dir_lst, "train_" + str(NUMBER) + ".lst"))
138 train_metas, train_vectors = get_subdata(data, train_lst)
139
140 # Get Val
141 val_lst = read_lstfile(os.path.join(dir_lst, "val_" + str(NUMBER) + ".lst"))
142 val_metas, val_vectors = get_subdata(data, val_lst)
143
144 # Get Test
145 test_lst = read_lstfile(os.path.join(dir_lst, "test_" + str(NUMBER) + ".lst"))
146 test_metas, test_vectors = get_subdata(data, test_lst)
147
148 # Verify shapes
149 print("verify shapes")
150 print(train_metas.shape)
151 print(val_metas.shape)
152 print(test_metas.shape)
153
154 # Train the k-means on the train set and predict clusters on val + test
155 #Ks = [12, 24, 48]
156
157 print("k=[", end="")
158 Ks = [6,12,24,48,64]
159 for k in Ks:
160 # Process the name
161 suffix = "_" + suffix_outfile if not suffix_outfile == "" else ""
162 k_str = "{:03d}".format(k) # K in string
163 filename_pickle = os.path.join(
164 DIR_DATA,
165 "clusters_trained_on_train_" +str(k_str)+ "_pickle_" + suffix + ".txt")
166 filename_clusters = os.path.join(
167 DIR_DATA,
168 "clusters_trained_on_train_" +str(k_str)+ suffix + ".txt")
169
170 # Check if one of the two files does not exist
171 condition = not(
172 os.path.exists(filename_pickle)
173 and os.path.exists(filename_clusters)
174 )
175
176 if condition:
177 print(str(k)+",", end=" ")
178 kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(
179 train_vectors)
180 test_pred = kmeans.predict(np.concatenate((val_vectors, test_vectors), axis=0))
181 metas_tosave = np.concatenate([train_metas, val_metas, test_metas], axis=0)
182 values_tosave = np.concatenate([kmeans.labels_, test_pred], axis=0)
183 metas_tosave[:, 1] = values_tosave # Replace char by clusters
184 save_file(filename_clusters, metas_tosave)
185 pickle.dump(kmeans, open( filename_pickle, "wb" ) )
186 print("]")
187
188 for NUMBER in range(1, 5):
189 print("JACKKNIFING NUMBER: " + str(NUMBER))
190 DIR_MAIN="exp/pvector-1"
191 DIR_DATA=os.path.join(DIR_MAIN, str(NUMBER))
192 DIR_LST=os.path.join(DIR_MAIN, "lst")
193 OUTFILE_NAME="clustering"
194
195 print("Calculating mass_effect_pvectors")
196 apply_clustering("masseffect_pvectors.txt",
197 dir_lst = os.path.join(DIR_MAIN, "lst"),
198 dir_data = DIR_DATA,
199 suffix_outfile = "")
200
201 print("Calculating mass_effect_pvectors_final")
202 apply_clustering("masseffect_pvectors_final.txt",
203 dir_lst = os.path.join(DIR_MAIN, "lst"),
204 dir_data = DIR_DATA,
205 suffix_outfile = "final")
206
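As read_vecfile and save_file above imply, each line of a vector file is a comma-separated meta block followed by space-separated float values. A minimal sketch of that format, with invented metas and values:

# Hypothetical line in the layout handled by read_vecfile / save_file.
line = "en-us,CHAR,spk,0001 0.12 -0.30 0.57"

metas = line.split(" ")[0].split(",")              # ['en-us', 'CHAR', 'spk', '0001']
values = [float(v) for v in line.split(" ")[1:]]   # [0.12, -0.3, 0.57]

# Writing it back in the same layout:
out_line = ",".join(metas) + " " + " ".join(str(v) for v in values)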
bin/data.py
File was created 1 '''
2 This module aims at loading and writing files.
3 Our files follow a specific format that
4 is not standard. This is why I hope these
5 functions make reading the files easier.
6
7 For more information about the data, please
8 read the README file.
9 '''
10
11 import sys
12
13 def read_file(filepath):
14 '''
15 Read the file and return an array with pairs
16 where each pair is composed by the metas and the
17 features.
18 '''
19 data = []
20 with open(filepath, "r") as f:
21 for line in f:
22 split_line = line.replace("\n", "").split(" ")
23 metas = split_line[0].split(",")
24 features = split_line[1:]
25 data.append((metas, features))
26 return data
27
28
29 def index_by(data, num_col):
30 '''
31 Allows the user to index data by the value of a given meta column.
32 '''
33 indexed = {}
34 for line in data:
35 metas = line[0]
36 features = line[1]
37 if metas[num_col] not in indexed:
38 indexed[metas[num_col]] = []
39 indexed[metas[num_col]].append((metas, features))
40 return indexed
41
42
43 def index_by_id(data):
44 '''
45 Allows the user to index data by id.
46 Indexing data by id consists in indexing twice
47 because data have two keys: one with the language
48 and the other one with the id of the sentence.
49 '''
50 indexed = {}
51 for line in data:
52 metas = line[0]
53 id_sen = metas[3]
54 lang = metas[0]
55 if lang not in indexed:
56 indexed[lang] = {}
57 indexed[lang][id_sen] = line
58 return indexed
59
60
61 def write_line(metas, features, f=sys.stdout):
62 '''
63 Just print the line. No need to specify a file.
64
65 metas: list of meta information
66 features: feature vector
67 f: file to write to
68 '''
69 print(",".join(metas) + " " + " ".join(features), file=f)
70
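A short usage sketch of these helpers, assuming a hypothetical features.txt whose lines follow the format described above (comma-separated metas, then the feature values):

from data import read_file, index_by_id, write_line

# Hypothetical file; each line: "lang,char,spk,id f1 f2 ..."
data = read_file("features.txt")
indexed = index_by_id(data)

# Access one utterance by language and sentence id (both keys invented here).
metas, features = indexed["en-us"]["0001"]
write_line(metas, features)  # prints "en-us,char,spk,0001 f1 f2 ..." to stdout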
bin/extract_kmeans.py
File was created 1 '''
2 This script extracts k-means cluster assignments from a
3 previously trained k-means model.
4 '''
5
6 import argparse
7 import numpy as np
8 import pickle
9 from data import read_file, index_by_id, write_line
10 import sys
11
12 # -- ARGPARSE --
13 parser = argparse.ArgumentParser(description="extract clusters")
14 parser.add_argument("model", type=str, help="k-means model pickle")
15 parser.add_argument("features", type=str, help="features")
16 parser.add_argument("list", type=str, help="list file")
17 parser.add_argument("--outfile", type=str, default=None, help="output file std")
18
19 args = vars(parser.parse_args())
20 MODEL = args["model"]
21 FEATURES = args["features"]
22 LST = args["list"]
23 OUTFILE = args["outfile"]
24
25 if OUTFILE is None:
26 OUTFILE = sys.stdout
27 else:
28 OUTFILE = open(OUTFILE, "w")
29
30 # -- READ FILE --
31 features = read_file(FEATURES)
32 feat_ind = index_by_id(features)
33
34 lst = read_file(LST)
35
36 kmeans = pickle.load(open(MODEL, "rb"))
37
38
39 # -- CONVERT TO NUMPY --
40 X = np.asarray([feat_ind[x[0][0]][x[0][3]][1] for x in lst])
41 predictions = kmeans.predict(X)
42
43 for i, line in enumerate(lst):
44 meta = line[0]
45 meta[1] = str(predictions[i])
46 write_line(
47 meta,
48 feat_ind[meta[0]][meta[3]][1],
49 OUTFILE
50 )
51
52 # -- CLOSE OUT FILE IF NECESSARY --
53 if OUTFILE is not sys.stdout:
54 OUTFILE.close()
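The loop above overwrites the second meta column with the predicted cluster before writing each line back out. A hypothetical illustration of that substitution (metas and prediction invented):

meta = ["en-us", "ARIA", "spk", "0042"]   # original metas, character in column 1
prediction = 7                            # cluster id returned by kmeans.predict
meta[1] = str(prediction)
print(",".join(meta))                     # -> "en-us,7,spk,0042"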
bin/extract_vectors.py
File was created 1 '''
2 The goal of this script is to extract vectors from a list.
3 One file contains the full content, and the list only enumerates the
4 vectors you want to keep.
5 '''
6
7 import os
8 import numpy as np
9 import argparse
10
11 parser = argparse.ArgumentParser(description='Extract a subset of vectors')
12 parser.add_argument('vectorsfile', type=str,
13 help='the path of the file containing the vectors')
14 parser.add_argument('listfile', type=str,
15 help='the path of the file containing the list of vectors kept')
16 parser.add_argument('-o', '--output', type=str,
17 default='a.out',
18 help='the path of the output file containing the vectors kept')
19
20 args = parser.parse_args()
21
22 # Editing global variable
23 VECTOR_FILE = args.vectorsfile
24 LIST_FILE = args.listfile
25 OUTPUT_FILE = args.output
26
27 # READ VECTOR DATA, INDEXED BY (LANGUAGE, SENTENCE ID)
28 data = {}
29 data["en-us"] = {}
30 data["fr-fr"] = {}
31 with open(VECTOR_FILE, "r") as f:
32 for line in f:
33 spl_line = line.split(" ")
34 spl_meta = spl_line[0].split(",")
35 lang = spl_meta[0]
36 iden = spl_meta[3]
37 data[lang][iden] = line
38
39 # READ LIST AND WRITE NEW FILE
40 with open(LIST_FILE, "r") as f, open(OUTPUT_FILE, "w") as o:
41 for line in f:
42 spl_meta = line.rstrip("\n").split(",")
43 lang = spl_meta[0]
44 iden = spl_meta[3]
45 o.write(data[lang][iden])
46
53
File was created 1 '''
2 Take a file and plot its data onto a 2d or 3d axis depending on the data.
3 '''
4
5 import os
6 import numpy as np
7 from sklearn.cluster import KMeans
8 import matplotlib.pyplot as plt
9 import argparse
10 import json
11
12 # Defining argparse
13 parser = argparse.ArgumentParser(prog='Plotter', description='Plot a file of 2d or 3d data')
14 parser.add_argument('filepath', type=str,
15 help='the path of the file you want to plot')
16 parser.add_argument('-o', '--output', type=str,
17 default='plot.pdf',
18 help='the path of the plotted file')
19 parser.add_argument('-t', '--toy', action='store_true',
20 help='test the script on a toy example. Do not test all the file content')
21
22 args = parser.parse_args()
23
24 # Editing global variable
25 FILE_PATH=args.filepath
26 OUTFILE_PATH = args.output
27 TOY_VERSION = args.toy
28
29 # Defining vectors with default number of column
30 vectors = np.empty((0, 64), np.float32)
31 metas = np.empty((0, 4), np.float32)
32
33 # READ DATA
34 with open(os.path.join(FILE_PATH), "r") as f:
35 for i, line in enumerate(f):
36 if TOY_VERSION == True and i > 100:
37 break
38 spl_line = line.split(" ")
39 if(len(vectors) == 0):
40 vectors = np.empty((0, len(spl_line[1:])), np.float32)
41 metas = np.append(
42 metas,
43 np.asarray([spl_line[0].split(",")]),
44 axis=0)
45
46 vectors = np.append(
47 vectors,
48 np.asarray([spl_line[1:]], dtype=np.float32),
49 axis=0)
50
51 vectors_T = np.transpose(vectors)
52
53
54 # Plot the file (3d data needs an axis with a 3d projection)
55 if(vectors_T.shape[0] == 3):
56 from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection
57 fig = plt.figure()
58 ax = fig.add_subplot(111, projection='3d')
59 ax.scatter(vectors_T[0], vectors_T[1], vectors_T[2])
60 ax.set_zlabel('Axis 3', fontsize=15)
61 else:
62 fig, ax = plt.subplots()
63 ax.scatter(vectors_T[0], vectors_T[1])
64
65 ax.set_xlabel('Axis 1', fontsize=15)
66 ax.set_ylabel('Axis 2', fontsize=15)
67
68 ax.set_title('Vector plot')
69
70 plt.savefig(OUTFILE_PATH)
71
bin/plot_character.py
File was created 1 '''
2 Take a file and plot its data onto a 2d or 3d axis depending on the data.
3 Automatic detection of the number of dimension.
4 '''
5
6 import os
7 import numpy as np
8 from sklearn.cluster import KMeans
9 import matplotlib.pyplot as plt
10 import argparse
11 import json
12 import pandas as pd
13
14 # Defining useful functions
15
16 '''
17 Read the file whose content is metas and vectors.
18 Returns two numpy arrays: (metas, vectors)
19
20 '''
21 def read_vector_file(filename, toy_version=False):
22 vectors = np.empty((0, 1), np.float32)
23 metas = np.empty((0, 4), np.float32)
24 with open(filename, "r") as f:
25 for i, line in enumerate(f):
26 if toy_version == True and i > 100:
27 break
28 spl_line = line.split(" ")
29 if(len(vectors) == 0):
30 vectors = np.empty((0, len(spl_line[1:])), np.float32)
31 metas = np.append(
32 metas,
33 np.asarray([spl_line[0].split(",")]),
34 axis=0)
35
36 vectors = np.append(
37 vectors,
38 np.asarray([spl_line[1:]], dtype=np.float32),
39 axis=0)
40 return (metas, vectors)
41
42
43 # Defining argparse
44 parser = argparse.ArgumentParser(description='Plot a file of 2d or 3d data')
45 parser.add_argument('vectorfile', type=str,
46 help='the path of the vectors file')
47 parser.add_argument('-o', '--output', type=str,
48 default='plot.pdf',
49 help='the path of the plotted file')
50 parser.add_argument('-t', '--toy', action='store_true',
51 help='test the script on a toy example. Do not test all the file content')
52
53 args = parser.parse_args()
54
55 # Editing global variable
56 VECTORFILE_PATH=args.vectorfile
57 OUTFILE_PATH = args.output
58 TOY_VERSION = args.toy
59
60
61 # Get Vectors
62 metas, vectors = read_vector_file(VECTORFILE_PATH, toy_version = TOY_VERSION)
63 vectors_T = np.transpose(vectors)
64
65 print("Number of characters: " + str(len(np.unique(np.transpose(metas)[1]))))
66 df = pd.DataFrame(dict(
67 x=vectors_T[0],
68 y=vectors_T[1],
69 character=np.transpose(metas)[1]
70 ))
71
72 groups = df.groupby('character')
73
74 # Plot
75 fig, ax = plt.subplots()
76
77 for character, group in groups:
78 ax.plot(group.x, group.y, marker='o', linestyle='', ms=2, label=character)
79 plt.savefig(OUTFILE_PATH)
80 print("Your plot is saved well (no check of this affirmation)")
81
bin/plot_clusters.py
File was created 1 '''
2 Take a file and plot its data onto a 2d or 3d axis depending on the data.
3 '''
4
5 import os
6 import numpy as np
7 from sklearn.cluster import KMeans
8 import matplotlib.pyplot as plt
9 import argparse
10 import json
11 import pandas as pd
12
13 # Defining useful functions
14
15 '''
16 Read the file whose content is metas and vectors.
17 Returns two numpy arrays: (metas, vectors)
18
19 '''
20 def read_vector_file(filename, toy_version=False):
21 vectors = np.empty((0, 1), np.float32)
22 metas = np.empty((0, 4), np.float32)
23 with open(filename, "r") as f:
24 for i, line in enumerate(f):
25 if toy_version == True and i > 100:
26 break
27 spl_line = line.split(" ")
28 if(len(vectors) == 0):
29 vectors = np.empty((0, len(spl_line[1:])), np.float32)
30 metas = np.append(
31 metas,
32 np.asarray([spl_line[0].split(",")]),
33 axis=0)
34
35 vectors = np.append(
36 vectors,
37 np.asarray([spl_line[1:]], dtype=np.float32),
38 axis=0)
39 return (metas, vectors)
40
41
42 '''
43 Check if the two given files have the same order.
44 '''
45 def check_files(vector_file, cluster_file):
46 with open(vector_file, "r") as f1, open(cluster_file, "r") as f2:
47 for line1, line2 in zip(f1, f2):
48 line1_str = line1.strip()
49 line2_str = line2.strip()
50 metas1 = line1_str.split(" ")[0].split(",")
51 metas2 = line2_str.split(" ")[0].split(",")
52 if(not metas1[0] == metas2[0] or not metas1[3] == metas2[3]):
53 return False
54 return True
55
56
57
58
59
60 # Defining argparse
61 parser = argparse.ArgumentParser(prog='Plotter', description='Plot a file of 2d or 3d data')
62 parser.add_argument('clusterfile', type=str,
63 help='the path of the cluster file')
64 parser.add_argument('vectorfile', type=str,
65 help='the path of the vectors file')
66 parser.add_argument('-o', '--output', type=str,
67 default='plot.pdf',
68 help='the path of the plotted file')
69 parser.add_argument('-t', '--toy', action='store_true',
70 help='test the script on a toy example. Do not test all the file content')
71
72 args = parser.parse_args()
73
74 # Editing global variable
75 CLUSTERFILE_PATH=args.clusterfile
76 VECTORFILE_PATH=args.vectorfile
77 OUTFILE_PATH = args.output
78 TOY_VERSION = args.toy
79
80 if not check_files(VECTORFILE_PATH, CLUSTERFILE_PATH):
81 print("The files are not in the same order. In a future version this will raise an exception. Stopping the process.")
82 exit(1)
83
84 # Get Vectors
85 metas, vectors = read_vector_file(VECTORFILE_PATH, toy_version = TOY_VERSION)
86 vectors_T = np.transpose(vectors)
87
88 # Get Clusters
89 metas, clusters = read_vector_file(CLUSTERFILE_PATH, toy_version = TOY_VERSION)
90
91 #print(np.transpose(clusters)[0])
92 #print(np.transpose(metas)[0])
93 df = pd.DataFrame(dict(
94 x=vectors_T[0],
95 y=vectors_T[1],
96 cluster=np.transpose(clusters)[0]
97 ))
98
99 groups = df.groupby('cluster')
100
101 # Plot
102 fig, ax = plt.subplots()
103
104 for cluster, group in groups:
105 ax.plot(group.x, group.y, marker='o', linestyle='', ms=2, label=cluster)
106 ax.legend()
107 plt.savefig(OUTFILE_PATH)
108
bin/tsne_clustering_plot.py
File was created 1 '''
2 Take one file with the clustering,
3 take another file with the t-SNE,
4 and then plot them.
5 '''
6
File was created 1 '''
2 The goal of this script is to calculate the t-SNE of pvectors.
3 '''
4
5 import os
6 import argparse
7 import numpy as np
8 from sklearn.manifold import TSNE
9
10 # Defining argparse
11 parser = argparse.ArgumentParser(prog='pvector tsne', description='Calculate the t-SNE representation of pvectors in 2 or 3 dimensions')
12 parser.add_argument('filepath', type=str,
13 help='the path of the file you want to calculate tsne')
14 parser.add_argument('-o', '--output', type=str,
15 default='.',
16 help='the path of the output file.')
17 parser.add_argument('-n', '--n-comp', type=int, choices=[2, 3],
18 default=2,
19 help='number of components output of tsne')
20 parser.add_argument('-t', '--toy', action='store_true',
21 help='test the script on a toy example. Do not test all the file content.')
22 args = parser.parse_args()
23
24 # Editing global variable
25 FILE_PATH=args.filepath
26 OUTFILE_PATH=args.output
27 TOY_VERSION=args.toy
28 N_COMP=args.n_comp
29
30 # Defining pvectors with default number of column
31 pvectors = np.empty((0, 64), np.float32)
32 metas = np.empty((0, 4), np.float32)
33
34
35 # READ DATA
36 with open(os.path.join(FILE_PATH), "r") as f:
37 for i, line in enumerate(f):
38 if TOY_VERSION == True and i > 100:
39 break
40 spl_line = line.split(" ")
41 if(len(pvectors) == 0):
42 pvectors = np.empty((0, len(spl_line[1:])), np.float32)
43 metas = np.append(
44 metas,
45 np.asarray([spl_line[0].split(",")]),
46 axis=0)
47 pvectors = np.append(
48 pvectors,
49 np.asarray([spl_line[1:]], dtype=np.float32),
50 axis=0)
51
52
53
54 # PREPARE SAVE FILE FUNCTION
55 def save_file(filepath, metas, values):
56 with open(filepath, "w") as f:
57 for i, value in enumerate(values):
58 metas_str = ",".join(str(v) for v in metas[i])
59 try:
60 infos_str = " ".join(str(v) for v in values[i])
61 except TypeError as te:
62 infos_str = str(values[i])
63 f.write(metas_str + " " + infos_str + "\n")
64
65 # CALCULATE T-SNE
66 X_embedded = TSNE(n_components=N_COMP).fit_transform(pvectors)
67 save_file(OUTFILE_PATH, metas, X_embedded)
68
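The saved file keeps the same "metas values" layout as the input, so it can be consumed by the plotting scripts above. A minimal read-back sketch, assuming the output was written to a hypothetical tsne_output.txt:

import numpy as np

metas, points = [], []
with open("tsne_output.txt", "r") as f:
    for line in f:
        fields = line.rstrip("\n").split(" ")
        metas.append(fields[0].split(","))
        points.append([float(v) for v in fields[1:]])

points = np.asarray(points)  # shape: (n_utterances, N_COMP)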