diff --git a/bin/cluster_kmeans.py b/bin/cluster_kmeans.py
new file mode 100644
index 0000000..63e951a
--- /dev/null
+++ b/bin/cluster_kmeans.py
@@ -0,0 +1,42 @@
+'''
+This script computes k-means models for a given data set,
+one model for each value of k between kmin and kmax.
+'''
+
+import argparse
+import pickle
+from os import path
+
+import numpy as np
+from sklearn.cluster import KMeans
+
+from data import read_file, index_by_id
+
+# -- ARGPARSE --
+parser = argparse.ArgumentParser(description="Cluster with k-means")
+parser.add_argument("features", type=str, help="features file")
+parser.add_argument("list", type=str, help="list on which to apply k-means")
+parser.add_argument("outdir", type=str, help="output directory for the k-means models")
+parser.add_argument("--kmin", type=int, help="minimum k", default=2)
+parser.add_argument("--kmax", type=int, help="maximum k", default=100)
+
+args = vars(parser.parse_args())
+FEATURES = args["features"]
+LST = args["list"]
+OUTDIR = args["outdir"]
+KMIN = args["kmin"]
+KMAX = args["kmax"]
+
+# -- READ FILES --
+features = read_file(FEATURES)
+feat_ind = index_by_id(features)
+
+lst = read_file(LST)
+
+# -- TRANSFORM INTO NUMPY --
+# The features are read as strings, so convert them to floats here.
+X = np.asarray([feat_ind[x[0][0]][x[0][3]][1] for x in lst], dtype=np.float32)
+
+for k in range(KMIN, KMAX + 1):
+    kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
+    with open(path.join(OUTDIR, "clustering_" + str(k) + ".pkl"), "wb") as f:
+        pickle.dump(kmeans, f)
diff --git a/bin/clustering_pvector.py b/bin/clustering_pvector.py
new file mode 100644
index 0000000..6004c16
--- /dev/null
+++ b/bin/clustering_pvector.py
@@ -0,0 +1,205 @@
+'''
+The goal of this script is to apply a clustering to p-vectors in order to
+find a new class for each utterance or frame. These new classes can be used
+to train new systems, for example replacing the character classes with the
+classes computed by the clustering. We hope this will produce interesting
+classes that help the system capture the structure of the voices.
+
+TODO: Change it so that it takes a number (1, 2, 3, 4) and computes
+everything needed, such as the clustering. Train on the train set, then
+project the test set onto this clustering to know which cluster each
+utterance belongs to.
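+
+Each input line is assumed (inferred from the parsing code in this
+repository) to carry four comma-separated metadata fields -- language,
+character, a third field these scripts do not use, and utterance id --
+followed by space-separated float features, e.g. (hypothetical values):
+    en-us,shepard,x,utt0042 0.12 -0.34 0.56 ...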
+'''
+
+import os
+import pickle
+
+import numpy as np
+from sklearn.cluster import KMeans
+
+
+'''
+Read a vector file and return its content as
+two numpy arrays: (metas, vectors).
+'''
+def read_vecfile(filepath, toy_version=False):
+    vectors = None
+    metas = None
+    with open(filepath, "r") as f:
+        for i, line in enumerate(f):
+            if toy_version and i > 100:
+                break
+            spl_line = line.split(" ")
+
+            if vectors is None:
+                vectors = np.empty((0, len(spl_line[1:])), np.float32)
+                metas = np.empty((0, len(spl_line[0].split(","))))
+
+            # Then we add the current line to the data
+            metas = np.append(
+                metas,
+                np.asarray([spl_line[0].split(",")]),
+                axis=0)
+
+            vectors = np.append(
+                vectors,
+                np.asarray([spl_line[1:]], dtype=np.float32),
+                axis=0)
+    return (metas, vectors)
+
+'''
+Return the list of metas of the given list file.
+'''
+def read_lstfile(filepath, toy_version=False):
+    metas = np.empty((0, 4))
+    with open(filepath, "r") as f:
+        for i, line in enumerate(f):
+            if toy_version and i > 100:
+                break
+            metas = np.append(
+                metas,
+                np.asarray([line.rstrip('\n').split(",")]),
+                axis=0)
+    return metas
+
+'''
+Save a vector file from metas and vector values.
+'''
+def save_file(filepath, metas, values=None):
+    with open(filepath, "w") as f:
+        for i in range(len(metas)):
+            metas_str = ",".join(str(v) for v in metas[i])
+            if values is not None:
+                try:
+                    infos_str = " ".join(str(v) for v in values[i])
+                except TypeError:
+                    infos_str = str(values[i])
+                f.write(metas_str + " " + infos_str + "\n")
+            else:
+                f.write(metas_str + "\n")
+
+'''
+Index the data by language, then by utterance id.
+'''
+def index_data(metas, vectors):
+    data = {}
+    data["en-us"] = {}
+    data["fr-fr"] = {}
+    for i, vector in enumerate(vectors):
+        meta = metas[i]
+        data[meta[0]][meta[3]] = {}
+        data[meta[0]][meta[3]]["metas"] = meta
+        data[meta[0]][meta[3]]["vector"] = vector
+    return data
+
+
+'''
+Extract a subset of the base data from a list.
+'''
+def get_subdata(data, lst):
+    metas = None
+    vectors = None
+    for meta in lst:
+        vector = data[meta[0]][meta[3]]["vector"]
+        if metas is None:
+            metas = np.empty((0, len(meta)))
+            vectors = np.empty((0, len(vector)), np.float64)
+        metas = np.append(
+            metas,
+            np.asarray([data[meta[0]][meta[3]]["metas"]]),
+            axis=0)
+        vectors = np.append(
+            vectors,
+            np.asarray([vector]),
+            axis=0)
+    return metas, vectors
+
+
+'''
+Apply clustering to the data of the given file.
+Use list files to split the data into train, validation, and test sets.
+Save the output files with the given suffix.
+Check the existence of the output files before computing and saving:
+if the two files already exist, the clustering is not computed again.
+
+However, if one of the two files is missing, this function will
+compute the clustering again.
+
+TODO: Add a variable to force the computation of all the files
+even if they exist.
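+
+Parameters (as used by the code below):
+    filename: name of the vector file inside dir_data
+    dir_lst: directory containing the train_<N>.lst, val_<N>.lst
+             and test_<N>.lst files (N is the jackknife number)
+    dir_data: directory containing the vector file and receiving the outputs
+    suffix_outfile: suffix appended to the output file names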
+''' +def apply_clustering(filename, dir_lst, dir_data, suffix_outfile): + + # Applicate it for normal version + metas, vectors = read_vecfile(os.path.join(dir_data, filename), toy_version=False) + data = index_data(metas, vectors) + + ### CURSOR + # Get Train + train_lst = read_lstfile(os.path.join(dir_lst, "train_" + str(NUMBER) + ".lst")) + train_metas, train_vectors = get_subdata(data, train_lst) + + # Get Val + val_lst = read_lstfile(os.path.join(dir_lst, "val_" + str(NUMBER) + ".lst")) + val_metas, val_vectors = get_subdata(data, val_lst) + + # Get Test + test_lst = read_lstfile(os.path.join(dir_lst, "test_" + str(NUMBER) + ".lst")) + test_metas, test_vectors = get_subdata(data, test_lst) + + # Verif shapes + print("verif shapes") + print(train_metas.shape) + print(val_metas.shape) + print(test_metas.shape) + + # Entrainer le k-means sur le train + val + #Ks = [12, 24, 48] + + print("k=[", end="") + Ks = [6,12,24,48,64] + for k in Ks: + # Process the name + suffix = "_" + suffix_outfile if not suffix_outfile == "" else "" + k_str = "{:03d}".format(k) # K in string + filename_pickle = os.path.join( + DIR_DATA, + "clusters_trained_on_train_" +str(k_str)+ "_pickle_" + suffix + ".txt") + filename_clusters = os.path.join( + DIR_DATA, + "clusters_trained_on_train_" +str(k_str)+ suffix + ".txt") + + # Check if on of the two file does not exist + condition = not( + os.path.exists(filename_pickle) + and os.path.exists(filename_clusters) + ) + + if condition: + print(str(k)+",", end=" ") + kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit( + train_vectors) + test_pred = kmeans.predict(np.concatenate((val_vectors, test_vectors), axis=0)) + metas_tosave = np.concatenate([train_metas, val_metas, test_metas], axis=0) + values_tosave = np.concatenate([kmeans.labels_, test_pred], axis=0) + metas_tosave[:, 1] = values_tosave # Replace char by clusters + save_file(filename_clusters, metas_tosave) + pickle.dump(kmeans, open( filename_pickle, "wb" ) ) + print("]") + +for NUMBER in range(1, 5): + print("JACKKNIFING NUMBER: " + str(NUMBER)) + DIR_MAIN="exp/pvector-1" + DIR_DATA=os.path.join(DIR_MAIN, str(NUMBER)) + DIR_LST=os.path.join(DIR_MAIN, "lst") + OUTFILE_NAME="clustering" + + print("Calculating mass_effect_pvectors") + apply_clustering("masseffect_pvectors.txt", + dir_lst = os.path.join(DIR_MAIN, "lst"), + dir_data = DIR_DATA, + suffix_outfile = "") + + print("Calculating mass_effect_pvectors_final") + apply_clustering("masseffect_pvectors_final.txt", + dir_lst = os.path.join(DIR_MAIN, "lst"), + dir_data = DIR_DATA, + suffix_outfile = "final") diff --git a/bin/data.py b/bin/data.py new file mode 100644 index 0000000..fb25050 --- /dev/null +++ b/bin/data.py @@ -0,0 +1,69 @@ +''' +This module aim in loading and writing files. +Our files respect a specific format that +is not standard. This is why i hope these +function make the read of file easier. + +For more information about the data, read +the README file please. +''' + +import sys + +def read_file(filepath): + ''' + Read the file and return an array with pairs + where each pair is composed by the metas and the + features. + ''' + data = [] + with open(filepath, "r") as f: + for line in f: + splited = line.replace("\n", "").split(" ") + metas = splited[0].split(",") + features = splited[1:] + data.append((metas, features)) + return data + + +def index_by(data, num_col): + ''' + Allows the user to index data by number of columns. 
+ ''' + indexed = {} + for line in data: + metas = line[0] + features = line[1] + if metas[num_col] not in indexed: + indexed[metas[num_col]] = [] + indexed[metas[num_col]].append((metas, features)) + return indexed + + +def index_by_id(data): + ''' + Allows the user to index data by id. + Index data by id consists in indexing two times + because data have two keys. On with the language + and the other one with the id of the sentence. + ''' + indexed = {} + for line in data: + metas = line[0] + id_sen = metas[3] + lang = metas[0] + if lang not in indexed: + indexed[lang] = {} + indexed[lang][id_sen] = line + return indexed + + +def write_line(metas, features, f=sys.stdout): + ''' + Just print the line. No need to specify a file. + + metas: meta information on list + features: feature vector + f: file to write it + ''' + print(",".join(metas) + " " + " ".join(features), file=f) diff --git a/bin/extract_kmeans.py b/bin/extract_kmeans.py new file mode 100644 index 0000000..2a95713 --- /dev/null +++ b/bin/extract_kmeans.py @@ -0,0 +1,54 @@ +''' +This script aims to extract k-means clustering from a +a priori trained k-means. +''' + +import argparse +import numpy as np +import pickle +from data import read_file, index_by_id, write_line +import sys + +# -- ARGPARSE -- +parser = argparse.ArgumentParser(description="extract clusters") +parser.add_argument("model", type=str, help="k-means model pickle") +parser.add_argument("features", type=str, help="features") +parser.add_argument("list", type=str, help="list file") +parser.add_argument("--outfile", type=str, default=None, help="output file std") + +args = vars(parser.parse_args()) +MODEL = args["model"] +FEATURES = args["features"] +LST = args["list"] +OUTFILE = args["outfile"] + +if OUTFILE == None: + OUTFILE = sys.stdout +else: + OUTFILE = open(OUTFILE, "w") + +# -- READ FILE -- +features = read_file(FEATURES) +feat_ind = index_by_id(features) + +lst = read_file(LST) + +kmeans = pickle.load(open(MODEL, "rb")) + + +# -- CONVERT TO NUMPY -- +X = np.asarray([feat_ind[x[0][0]][x[0][3]][1] for x in lst]) +predictions = kmeans.predict(X) + +for i, line in enumerate(lst): + meta = line[0] + meta[1] = str(predictions[i]) + write_line( + meta, + feat_ind[meta[0]][meta[3]][1], + OUTFILE + ) + +# -- CLOSE OUT FILE IF NECESSARY -- +if not OUTFILE == sys.stdout: + OUTFILE.close() \ No newline at end of file diff --git a/bin/extract_vectors.py b/bin/extract_vectors.py new file mode 100644 index 0000000..babe09f --- /dev/null +++ b/bin/extract_vectors.py @@ -0,0 +1,52 @@ +''' +The goal of this script is to extract vectors from a list. +One file is the full content, and the list only enumerate the +vectors you want to keep. 
+''' + +import os +import numpy as np +import argparse + +parser = argparse.ArgumentParser(description='Extract a subset of vectors') +parser.add_argument('vectorsfile', type=str, + help='the path of the file containing the convectors') +parser.add_argument('listfile', type=str, + help='the path of the file containing the list of vectors kept') +parser.add_argument('-o', '--output', type=str, + default='a.out', + help='the path the output file containing the vectors kept') + +args = parser.parse_args() + +# Editing global variable +VECTOR_FILE = args.vectorsfile +LIST_FILE = args.listfile +OUTPUT_FILE = args.output + +# READ VECTOR DATA +data = {} +data["en-us"] = {} +data["fr-fr"] = {} +with open(VECTOR_FILE, "r") as f: + for i, line in enumerate(f): + if TOY_VERSION == True and i > 100: + break + spl_line = line.split(" ") + if(len(pvectors) == 0): + pvectors = np.empty((0, len(spl_line[1:])), np.float32) + spl_meta = spl_line.split(",") + lang = spl_meta[0] + iden = spl_meta[3] + data[lang][iden] = line + +# READ LIST AND WRITE NEW FILE +with open(LIST_FILE, "r") as f, open(OUTPUT_FILE, "w") as o: + for i, line in enumerate(LIST_FILE): + if TOY_VERSION == True and i > 100: + break + spl_meta = line.split(",") + lang = spl_meta[0] + iden = spl_meta[3] + OUTPUT_FILE.write(data[lang][iden]) + diff --git a/bin/plot.py b/bin/plot.py new file mode 100644 index 0000000..511476b --- /dev/null +++ b/bin/plot.py @@ -0,0 +1,70 @@ +''' +Take a file and plot its data onto a 2d or 3d axis depending on the data. +''' + +import os +import numpy as np +from sklearn.cluster import KMeans +import matplotlib.pyplot as plt +import argparse +import json + +# Defining argparse +parser = argparse.ArgumentParser(prog='Plotter', description='Plot a file of 2d ou 3d dimension') +parser.add_argument('filepath', type=str, + help='the path of the file you want to plot') +parser.add_argument('-o-', '--output', type=str, + default='plot.pdf', + help='the path of the ploted file') +parser.add_argument('-t', '--toy', action='store_true', + help='test the script on a toy example. Do not test all the file content') + +args = parser.parse_args() + +# Editing global variable +FILE_PATH=args.filepath +OUTFILE_PATH = args.output +TOY_VERSION = args.toy + +# Defining vectors with default number of column +vectors = np.empty((0, 64), np.float32) +metas = np.empty((0, 4), np.float32) + +# READ DATA +with open(os.path.join(FILE_PATH), "r") as f: + for i, line in enumerate(f): + if TOY_VERSION == True and i > 100: + break + spl_line = line.split(" ") + if(len(vectors) == 0): + vectors = np.empty((0, len(spl_line[1:])), np.float32) + metas = np.append( + metas, + np.asarray([spl_line[0].split(",")]), + axis=0) + + vectors = np.append( + vectors, + np.asarray([spl_line[1:]], dtype=np.float32), + axis=0) + +vectors_T = np.transpose(vectors) + + +# Plot the file +plt.plot(vectors, 'ro') +fig, ax = plt.subplots() + +if(vectors_T.shape[0] == 2): + ax.scatter(vectors_T[0], vectors_T[1]) #c=close, s=volume, alpha=0.5) +else: + ax.scatter(vectors_T[0], vectors_T[1], vectors_T[2]) + +ax.set_xlabel('Axe 1', fontsize=15) +ax.set_ylabel('Axe 2', fontsize=15) + +if(vectors_T.shape[0] == 3): + ax.set_zlabel('Axe 3', fontsize15=15) + +ax.set_title('Volume and percent change') +plt.savefig(OUTFILE_PATH) diff --git a/bin/plot_character.py b/bin/plot_character.py new file mode 100644 index 0000000..aea0577 --- /dev/null +++ b/bin/plot_character.py @@ -0,0 +1,80 @@ +''' +Take a file and plot its data onto a 2d or 3d axis depending on the data. 
+The number of distinct characters found is printed out.
+'''
+
+import numpy as np
+import matplotlib.pyplot as plt
+import argparse
+import pandas as pd
+
+# Defining useful functions
+
+'''
+Read a file whose content is metas and vectors.
+Returns two numpy arrays: (metas, vectors).
+'''
+def read_vector_file(filename, toy_version=False):
+    vectors = np.empty((0, 1), np.float32)
+    metas = np.empty((0, 4), np.float32)
+    with open(filename, "r") as f:
+        for i, line in enumerate(f):
+            if toy_version and i > 100:
+                break
+            spl_line = line.split(" ")
+            if len(vectors) == 0:
+                vectors = np.empty((0, len(spl_line[1:])), np.float32)
+            metas = np.append(
+                metas,
+                np.asarray([spl_line[0].split(",")]),
+                axis=0)
+
+            vectors = np.append(
+                vectors,
+                np.asarray([spl_line[1:]], dtype=np.float32),
+                axis=0)
+    return (metas, vectors)
+
+
+# Defining argparse
+parser = argparse.ArgumentParser(description='Plot a 2d vector file, one color per character')
+parser.add_argument('vectorfile', type=str,
+                    help='the path of the vectors file')
+parser.add_argument('-o', '--output', type=str,
+                    default='plot.pdf',
+                    help='the path of the plotted file')
+parser.add_argument('-t', '--toy', action='store_true',
+                    help='test the script on a toy example; do not read the whole file content')
+
+args = parser.parse_args()
+
+# Editing global variables
+VECTORFILE_PATH = args.vectorfile
+OUTFILE_PATH = args.output
+TOY_VERSION = args.toy
+
+
+# Get vectors
+metas, vectors = read_vector_file(VECTORFILE_PATH, toy_version=TOY_VERSION)
+vectors_T = np.transpose(vectors)
+
+print("Number of characters: " + str(len(np.unique(np.transpose(metas)[1]))))
+df = pd.DataFrame(dict(
+    x=vectors_T[0],
+    y=vectors_T[1],
+    character=np.transpose(metas)[1]
+))
+
+groups = df.groupby('character')
+
+# Plot
+fig, ax = plt.subplots()
+
+for character, group in groups:
+    ax.plot(group.x, group.y, marker='o', linestyle='', ms=2, label=character)
+plt.savefig(OUTFILE_PATH)
+print("Plot saved to " + OUTFILE_PATH + " (writing is not verified)")
diff --git a/bin/plot_clusters.py b/bin/plot_clusters.py
new file mode 100644
index 0000000..8542e2f
--- /dev/null
+++ b/bin/plot_clusters.py
@@ -0,0 +1,107 @@
+'''
+Take a cluster file and a 2d vector file and plot the vectors,
+one color per cluster.
+'''
+
+import numpy as np
+import matplotlib.pyplot as plt
+import argparse
+import pandas as pd
+
+# Defining useful functions
+
+'''
+Read a file whose content is metas and vectors.
+Returns two numpy arrays: (metas, vectors).
+'''
+def read_vector_file(filename, toy_version=False):
+    vectors = np.empty((0, 1), np.float32)
+    metas = np.empty((0, 4), np.float32)
+    with open(filename, "r") as f:
+        for i, line in enumerate(f):
+            if toy_version and i > 100:
+                break
+            spl_line = line.split(" ")
+            if len(vectors) == 0:
+                vectors = np.empty((0, len(spl_line[1:])), np.float32)
+            metas = np.append(
+                metas,
+                np.asarray([spl_line[0].split(",")]),
+                axis=0)
+
+            vectors = np.append(
+                vectors,
+                np.asarray([spl_line[1:]], dtype=np.float32),
+                axis=0)
+    return (metas, vectors)
+
+
+'''
+Check that the two given files list the utterances in the same order.
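+The order is compared on the language (column 0) and the
+utterance id (column 3) of each line pair.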
+''' +def check_files(vector_file, cluster_file): + with open(vector_file, "r") as f1, open(cluster_file, "r") as f2: + for line1, line2 in zip(f1, f2): + line1_str = line1.strip() + line2_str = line2.strip() + metas1 = line1_str.split(" ")[0].split(",") + metas2 = line2_str.split(" ")[0].split(",") + if(not metas1[0] == metas2[0] or not metas1[3] == metas2[3]): + return False + return True + + + + + +# Defining argparse +parser = argparse.ArgumentParser(prog='Plotter', description='Plot a file of 2d ou 3d dimension') +parser.add_argument('clusterfile', type=str, + help='the path of the cluster file') +parser.add_argument('vectorfile', type=str, + help='the path of the vectors file') +parser.add_argument('-o-', '--output', type=str, + default='plot.pdf', + help='the path of the ploted file') +parser.add_argument('-t', '--toy', action='store_true', + help='test the script on a toy example. Do not test all the file content') + +args = parser.parse_args() + +# Editing global variable +CLUSTERFILE_PATH=args.clusterfile +VECTORFILE_PATH=args.vectorfile +OUTFILE_PATH = args.output +TOY_VERSION = args.toy + +if check_files(VECTORFILE_PATH, CLUSTERFILE_PATH) == False: + print("Les fichiers ne sont pas dans le meme ordre. Dans une version futur, cela générera une exception. On stop le processus.") + exit(1) + +# Get Vectors +metas, vectors = read_vector_file(VECTORFILE_PATH, toy_version = TOY_VERSION) +vectors_T = np.transpose(vectors) + +# Get Clusters +metas, clusters = read_vector_file(CLUSTERFILE_PATH, toy_version = TOY_VERSION) + +#print(np.transpose(clusters)[0]) +#print(np.transpose(metas)[0]) +df = pd.DataFrame(dict( + x=vectors_T[0], + y=vectors_T[1], + cluster=np.transpose(clusters)[0] + )) + +groups = df.groupby('cluster') + +# Plot +fig, ax = plt.subplots() + +for cluster, group in groups: + ax.plot(group.x, group.y, marker='o', linestyle='', ms=2, label=cluster) +ax.legend() +plt.savefig(OUTFILE_PATH) diff --git a/bin/tsne_clustering_plot.py b/bin/tsne_clustering_plot.py new file mode 100644 index 0000000..7894fea --- /dev/null +++ b/bin/tsne_clustering_plot.py @@ -0,0 +1,5 @@ +''' +Take one file with clustering +Take an other file with tsne +and then plot them +''' diff --git a/bin/tsne_pvector.py b/bin/tsne_pvector.py new file mode 100644 index 0000000..96987e5 --- /dev/null +++ b/bin/tsne_pvector.py @@ -0,0 +1,67 @@ +''' +The goal of this script is to display calculate tsne of pvectors. +''' + +import os +import argparse +import numpy as np +from sklearn.manifold import TSNE + +# Defining argparse +parser = argparse.ArgumentParser(prog='pvector tsne', description='Calculate the tsne representation of pvector in 3 or 2d') +parser.add_argument('filepath', type=str, + help='the path of the file you want to calculate tsne') +parser.add_argument('-o', '--output', type=str, + default='.', + help='the path of the output file.') +parser.add_argument('-n', '--n-comp', type=int, choices=[2, 3], + default='2', + help='number of components output of tsne') +parser.add_argument('-t', '--toy', action='store_true', + help='test the script on a toy example. 
+args = parser.parse_args()
+
+# Editing global variables
+FILE_PATH = args.filepath
+OUTFILE_PATH = args.output
+TOY_VERSION = args.toy
+N_COMP = args.n_comp
+
+# Defining pvectors with a default number of columns
+pvectors = np.empty((0, 64), np.float32)
+metas = np.empty((0, 4), np.float32)
+
+
+# READ DATA
+with open(FILE_PATH, "r") as f:
+    for i, line in enumerate(f):
+        if TOY_VERSION and i > 100:
+            break
+        spl_line = line.split(" ")
+        if len(pvectors) == 0:
+            pvectors = np.empty((0, len(spl_line[1:])), np.float32)
+        metas = np.append(
+            metas,
+            np.asarray([spl_line[0].split(",")]),
+            axis=0)
+        pvectors = np.append(
+            pvectors,
+            np.asarray([spl_line[1:]], dtype=np.float32),
+            axis=0)
+
+
+# PREPARE THE SAVE FILE FUNCTION
+def save_file(filepath, metas, values):
+    with open(filepath, "w") as f:
+        for i in range(len(values)):
+            metas_str = ",".join(str(v) for v in metas[i])
+            try:
+                infos_str = " ".join(str(v) for v in values[i])
+            except TypeError:
+                infos_str = str(values[i])
+            f.write(metas_str + " " + infos_str + "\n")
+
+# CALCULATE THE T-SNE
+X_embedded = TSNE(n_components=N_COMP).fit_transform(pvectors)
+save_file(OUTFILE_PATH, metas, X_embedded)
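+
+# Note: the output keeps the metas of the input file and replaces the
+# feature values with the 2d/3d t-SNE coordinates, so the result can be
+# plotted with bin/plot_character.py or bin/plot_clusters.py.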