Commit ac78b07ea0ab18b7855f1b752e90fcac99440c98

Authored by Mathias Quillot
1 parent b8acebc1ed
Exists in master

All base bin files added

Showing 11 changed files with 751 additions and 0 deletions

bin/__pycache__/data.cpython-36.pyc
No preview for this file type
bin/cluster_kmeans.py
  1 +'''
  2 +This script aims at computing k-means for a given
  3 +dataset.
  4 +'''
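+# Example usage (paths are illustrative, not from the repo):
+#   python bin/cluster_kmeans.py feats.txt train.lst exp/kmeans --kmin 2 --kmax 10
+# One pickle is written per k: exp/kmeans/clustering_2.pkl ... clustering_10.pkl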
  5 +
  6 +import argparse
  7 +import numpy as np
  8 +from sklearn.cluster import KMeans
  9 +from os import path
  10 +
  11 +import pickle
  12 +from data import read_file, index_by_id
  13 +
  14 +# -- ARGPARSE --
  15 +parser = argparse.ArgumentParser(description="Cluster with kmeans")
  16 +parser.add_argument("features", type=str, help="Features file")
  17 +parser.add_argument("list", type=str, help="List on which apply kmeans")
  18 +parser.add_argument("outdir", type=str, help="Output directory for k-means models")
  19 +parser.add_argument("--kmin", type=int, help="minimum k", default=2)
  20 +parser.add_argument("--kmax", type=int, help="maximum k", default=100)
  21 +
  22 +args = vars(parser.parse_args())
  23 +FEATURES = args["features"]
  24 +LST = args["list"]
  25 +OUTDIR = args["outdir"]
  26 +KMIN = args["kmin"]
  27 +KMAX = args["kmax"]
  28 +
  29 +# -- READ FILES --
  30 +features = read_file(FEATURES)
  31 +feat_ind = index_by_id(features)
  32 +
  33 +lst = read_file(LST)
  34 +
  35 +# -- TRANSFORM INTO NUMPY --
  36 +X = np.asarray([feat_ind[x[0][0]][x[0][3]][1] for x in lst])  # x[0] = metas: [0] language, [3] utterance id
  37 +
  38 +Ks = range(KMIN, KMAX+1)
  39 +for k in Ks:
  40 +    kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
  41 +    pickle.dump(kmeans, open(path.join(OUTDIR, "clustering_" + str(k) + ".pkl"), "wb"))
bin/clustering_pvector.py
  1 +'''
  2 +The goal of this script is to apply a clustering to p-vectors in order to assign a new class to each utterance or frame.
  3 +These new classes can be used to train new systems, replacing character classes, for example, by the classes computed from the clustering.
  4 +We hope this will generate interesting classes that help the system understand the structure of the voices.
  5 +
  6 +TODO: Change it so that it takes a number (1, 2, 3, 4) and computes everything needed, such as the clustering. Train on the train set, then project the test set onto this clustering to know which cluster each sample belongs to.
  7 +'''
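+# This script takes no arguments; it assumes the layout used by the main
+# loop at the bottom of this file:
+#   exp/pvector-1/<N>/masseffect_pvectors.txt          for N in 1..4
+#   exp/pvector-1/lst/{train,val,test}_<N>.lst
+# Run as: python bin/clustering_pvector.py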
  8 +
  9 +import os
  10 +import numpy as np
  11 +from sklearn.cluster import KMeans
  12 +import matplotlib.pyplot as plt
  13 +import argparse
  14 +import pandas as pd
  15 +import pickle
  16 +
  17 +
  18 +'''
  19 +Return the data as two numpy arrays: (metas, vectors).
  20 +'''
  21 +def read_vecfile(filepath, toy_version=False):
  22 + vectors = ""
  23 + metas = ""
  24 + with open(filepath, "r") as f:
  25 + for i, line in enumerate(f):
  26 + if toy_version == True and i > 100:
  27 + break
  28 + spl_line = line.split(" ")
  29 +
  30 + if(len(vectors) == 0):
  31 + vectors = np.empty((0, len(spl_line[1:])), np.float32)
  32 + metas = np.empty((0, len(spl_line[0].split(","))))
  33 +
  34 + # Then we add the current line to the data
  35 + metas = np.append(
  36 + metas,
  37 + np.asarray([spl_line[0].split(",")]),
  38 + axis=0)
  39 +
  40 + vectors = np.append(
  41 + vectors,
  42 + np.asarray([spl_line[1:]], dtype=np.float32),
  43 + axis=0)
  44 + return (metas, vectors)
  45 +
  46 +'''
  47 +Return the list of metas from the list file.
  48 +'''
  49 +def read_lstfile(filepath, toy_version=False):
  50 +    metas = np.empty((0, 4))
  51 +    with open(filepath, "r") as f:
  52 +        for i, line in enumerate(f):
  53 +            if toy_version and i > 100:
  54 +                break
  55 +            metas = np.append(
  56 +                metas,
  57 +                np.asarray([line.rstrip('\n').split(",")]),
  58 +                axis=0)
  59 +    return metas
  60 +
  61 +'''
  62 +Save a vector file from metas and vector values
  63 +'''
  64 +def save_file(filepath, metas, values=None):
  65 + with open(filepath, "w") as f:
  66 + for i in range(len(metas)):
  67 + metas_str = ",".join(str(v) for v in metas[i])
  68 + if not values == None:
  69 + try:
  70 + infos_str = " ".join(str(v) for v in values[i])
  71 + except TypeError as te:
  72 + infos_str = str(values[i])
  73 + f.write(metas_str + " " + infos_str + "\n")
  74 + else:
  75 + f.write(metas_str + "\n")
  76 +
  77 +'''
  78 +Index the data by language and utterance id.
  79 +'''
  80 +def index_data(metas, vectors):
  81 +    data = {}
  82 +    data["en-us"] = {}
  83 +    data["fr-fr"] = {}
  84 +    for i, vector in enumerate(vectors):
  85 +        meta = metas[i]
  86 +        data[meta[0]][meta[3]] = {}
  87 +        data[meta[0]][meta[3]]["metas"] = meta
  88 +        data[meta[0]][meta[3]]["vector"] = vector
  89 +    return data
  90 +
  91 +
  92 +
  93 +'''
  94 +Get a subset of the base data from a
  95 +list.
  96 +'''
  97 +def get_subdata(data, lst):
  98 + metas = ""
  99 + vectors = ""
  100 + for meta in lst:
  101 + vector = data[meta[0]][meta[3]]["vector"]
  102 + if(len(metas) == 0):
  103 + metas = np.empty((0, len(meta)))
  104 + vectors = np.empty((0, len(vector)), np.float64)
  105 + metas = np.append(
  106 + metas,
  107 + np.asarray([data[meta[0]][meta[3]]["metas"]]),
  108 + axis=0)
  109 + vectors = np.append(
  110 + vectors,
  111 + np.asarray([vector]),
  112 + axis=0)
  113 + return metas, vectors
  114 +
  115 +
  116 +'''
  117 +Apply clustering to the data in filename.
  118 +Use list files to split the data into train, validation and test.
  119 +Save the results with the given suffix.
  120 +Check the existence of the files before computing and saving:
  121 +if the two files already exist, the clustering is not computed again.
  122 +
  123 +However, if one of the files is missing, this function will compute
  124 +it again.
  125 +
  126 +TODO: Add a variable to force the computation of all the files
  127 +even if they already exist.
  128 +'''
  129 +def apply_clustering(filename, dir_lst, dir_data, suffix_outfile):
  130 +
  131 +    # Apply it to the normal version
  132 +    metas, vectors = read_vecfile(os.path.join(dir_data, filename), toy_version=False)
  133 +    data = index_data(metas, vectors)
  134 +
  135 +
  136 +    # Get Train
  137 +    train_lst = read_lstfile(os.path.join(dir_lst, "train_" + str(NUMBER) + ".lst"))
  138 +    train_metas, train_vectors = get_subdata(data, train_lst)
  139 +
  140 +    # Get Val
  141 +    val_lst = read_lstfile(os.path.join(dir_lst, "val_" + str(NUMBER) + ".lst"))
  142 +    val_metas, val_vectors = get_subdata(data, val_lst)
  143 +
  144 +    # Get Test
  145 +    test_lst = read_lstfile(os.path.join(dir_lst, "test_" + str(NUMBER) + ".lst"))
  146 +    test_metas, test_vectors = get_subdata(data, test_lst)
  147 +
  148 +    # Verify shapes
  149 +    print("verif shapes")
  150 +    print(train_metas.shape)
  151 +    print(val_metas.shape)
  152 +    print(test_metas.shape)
  153 +
  154 +    # Train the k-means on train, then project val + test onto it
  155 +    #Ks = [12, 24, 48]
  156 +
  157 +    print("k=[", end="")
  158 +    Ks = [6, 12, 24, 48, 64]
  159 +    for k in Ks:
  160 +        # Build the output file names
  161 +        suffix = "_" + suffix_outfile if suffix_outfile != "" else ""
  162 +        k_str = "{:03d}".format(k)  # k as a zero-padded string
  163 +        filename_pickle = os.path.join(
  164 +            DIR_DATA,
  165 +            "clusters_trained_on_train_" + str(k_str) + "_pickle_" + suffix + ".txt")
  166 +        filename_clusters = os.path.join(
  167 +            DIR_DATA,
  168 +            "clusters_trained_on_train_" + str(k_str) + suffix + ".txt")
  169 +
  170 +        # Check if one of the two files does not exist
  171 +        condition = not(
  172 +            os.path.exists(filename_pickle)
  173 +            and os.path.exists(filename_clusters)
  174 +        )
  175 +
  176 +        if condition:
  177 +            print(str(k) + ",", end=" ")
  178 +            kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(
  179 +                train_vectors)
  180 +            test_pred = kmeans.predict(np.concatenate((val_vectors, test_vectors), axis=0))
  181 +            metas_tosave = np.concatenate([train_metas, val_metas, test_metas], axis=0)
  182 +            values_tosave = np.concatenate([kmeans.labels_, test_pred], axis=0)
  183 +            metas_tosave[:, 1] = values_tosave  # Replace the character column by the cluster labels
  184 +            save_file(filename_clusters, metas_tosave)
  185 +            pickle.dump(kmeans, open(filename_pickle, "wb"))
  186 +    print("]")
  187 +
  188 +for NUMBER in range(1, 5):
  189 + print("JACKKNIFING NUMBER: " + str(NUMBER))
  190 + DIR_MAIN="exp/pvector-1"
  191 + DIR_DATA=os.path.join(DIR_MAIN, str(NUMBER))
  192 + DIR_LST=os.path.join(DIR_MAIN, "lst")
  193 + OUTFILE_NAME="clustering"
  194 +
  195 + print("Calculating mass_effect_pvectors")
  196 + apply_clustering("masseffect_pvectors.txt",
  197 + dir_lst = os.path.join(DIR_MAIN, "lst"),
  198 + dir_data = DIR_DATA,
  199 + suffix_outfile = "")
  200 +
  201 + print("Calculating mass_effect_pvectors_final")
  202 + apply_clustering("masseffect_pvectors_final.txt",
  203 + dir_lst = os.path.join(DIR_MAIN, "lst"),
  204 + dir_data = DIR_DATA,
  205 + suffix_outfile = "final")
bin/data.py
  1 +'''
  2 +This module aims at loading and writing files.
  3 +Our files follow a specific format that
  4 +is not standard. This is why I hope these
  5 +functions make reading the files easier.
  6 +
  7 +For more information about the data, please read
  8 +the README file.
  9 +'''
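+# Assumed line format, inferred from read_file below (field names and
+# values are illustrative):
+#   <lang>,<character>,<meta>,<utterance-id> <feat_1> <feat_2> ... <feat_n>
+# e.g. "en-us,charA,x,utt_0001 0.12 -0.34 0.56"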
  10 +
  11 +import sys
  12 +
  13 +def read_file(filepath):
  14 +    '''
  15 +    Read the file and return an array of pairs,
  16 +    where each pair is composed of the metas and the
  17 +    features.
  18 +    '''
  19 +    data = []
  20 +    with open(filepath, "r") as f:
  21 +        for line in f:
  22 +            splited = line.replace("\n", "").split(" ")
  23 +            metas = splited[0].split(",")
  24 +            features = splited[1:]
  25 +            data.append((metas, features))
  26 +    return data
  27 +
  28 +
  29 +def index_by(data, num_col):
  30 +    '''
  31 +    Index the data by the value of the given meta column.
  32 +    '''
  33 +    indexed = {}
  34 +    for line in data:
  35 +        metas = line[0]
  36 +        features = line[1]
  37 +        if metas[num_col] not in indexed:
  38 +            indexed[metas[num_col]] = []
  39 +        indexed[metas[num_col]].append((metas, features))
  40 +    return indexed
  41 +
  42 +
  43 +def index_by_id(data):
  44 +    '''
  45 +    Index the data by id.
  46 +    Indexing by id consists of indexing twice,
  47 +    because the data have two keys: one for the language
  48 +    and the other for the id of the sentence.
  49 +    '''
  50 +    indexed = {}
  51 +    for line in data:
  52 +        metas = line[0]
  53 +        id_sen = metas[3]
  54 +        lang = metas[0]
  55 +        if lang not in indexed:
  56 +            indexed[lang] = {}
  57 +        indexed[lang][id_sen] = line
  58 +    return indexed
  59 +
  60 +
  61 +def write_line(metas, features, f=sys.stdout):
  62 +    '''
  63 +    Just print the line; no need to specify a file (defaults to stdout).
  64 +
  65 +    metas: meta information list
  66 +    features: feature vector
  67 +    f: file to write to
  68 +    '''
  69 +    print(",".join(metas) + " " + " ".join(features), file=f)
bin/extract_kmeans.py
  1 +'''
  2 +This script aims at extracting k-means clustering from an
  3 +a priori trained k-means.
  4 +'''
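+# Example usage (paths are illustrative):
+#   python bin/extract_kmeans.py exp/kmeans/clustering_6.pkl feats.txt all.lst --outfile clusters.txt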
  5 +
  6 +import argparse
  7 +import numpy as np
  8 +import pickle
  9 +from data import read_file, index_by_id, write_line
  10 +import sys
  11 +
  12 +# -- ARGPARSE --
  13 +parser = argparse.ArgumentParser(description="extract clusters")
  14 +parser.add_argument("model", type=str, help="k-means model pickle")
  15 +parser.add_argument("features", type=str, help="features")
  16 +parser.add_argument("list", type=str, help="list file")
  17 +parser.add_argument("--outfile", type=str, default=None, help="output file std")
  18 +
  19 +args = vars(parser.parse_args())
  20 +MODEL = args["model"]
  21 +FEATURES = args["features"]
  22 +LST = args["list"]
  23 +OUTFILE = args["outfile"]
  24 +
  25 +if OUTFILE is None:
  26 +    OUTFILE = sys.stdout
  27 +else:
  28 +    OUTFILE = open(OUTFILE, "w")
  29 +
  30 +# -- READ FILE --
  31 +features = read_file(FEATURES)
  32 +feat_ind = index_by_id(features)
  33 +
  34 +lst = read_file(LST)
  35 +
  36 +kmeans = pickle.load(open(MODEL, "rb"))
  37 +
  38 +
  39 +# -- CONVERT TO NUMPY --
  40 +X = np.asarray([feat_ind[x[0][0]][x[0][3]][1] for x in lst])
  41 +predictions = kmeans.predict(X)
  42 +
  43 +for i, line in enumerate(lst):
  44 +    meta = line[0]
  45 +    meta[1] = str(predictions[i])  # replace the character column by the cluster label
  46 +    write_line(
  47 +        meta,
  48 +        feat_ind[meta[0]][meta[3]][1],
  49 +        OUTFILE
  50 +    )
  51 +
  52 +# -- CLOSE OUT FILE IF NECESSARY --
  53 +if OUTFILE is not sys.stdout:
  54 +    OUTFILE.close()
bin/extract_vectors.py
  1 +'''
  2 +The goal of this script is to extract vectors from a list.
  3 +One file contains the full content, and the list only enumerates the
  4 +vectors you want to keep.
  5 +'''
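+# Example usage (paths are illustrative):
+#   python bin/extract_vectors.py feats.txt keep.lst -o subset.txt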
  6 +
  7 +import os
  8 +import numpy as np
  9 +import argparse
  10 +
  11 +parser = argparse.ArgumentParser(description='Extract a subset of vectors')
  12 +parser.add_argument('vectorsfile', type=str,
  13 +    help='the path of the file containing the vectors')
  14 +parser.add_argument('listfile', type=str,
  15 + help='the path of the file containing the list of vectors kept')
  16 +parser.add_argument('-o', '--output', type=str,
  17 + default='a.out',
  18 + help='the path the output file containing the vectors kept')
  19 +
  20 +args = parser.parse_args()
  21 +
  22 +# Editing global variable
  23 +VECTOR_FILE = args.vectorsfile
  24 +LIST_FILE = args.listfile
  25 +OUTPUT_FILE = args.output
  26 +
  27 +# READ VECTOR DATA
  28 +data = {}
  29 +data["en-us"] = {}
  30 +data["fr-fr"] = {}
  31 +with open(VECTOR_FILE, "r") as f:
  32 +    for line in f:
  33 +        spl_line = line.split(" ")
  34 +        spl_meta = spl_line[0].split(",")
  35 +        lang = spl_meta[0]
  36 +        iden = spl_meta[3]
  37 +        data[lang][iden] = line
  38 +
  39 +# READ LIST AND WRITE NEW FILE
  40 +with open(LIST_FILE, "r") as f, open(OUTPUT_FILE, "w") as o:
  41 +    for line in f:
  42 +        spl_meta = line.rstrip("\n").split(",")
  43 +        lang = spl_meta[0]
  44 +        iden = spl_meta[3]
  45 +        o.write(data[lang][iden])
  1 +'''
  2 +Take a file and plot its data onto a 2d or 3d axis depending on the data.
  3 +'''
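+# Example usage (the script and file names are illustrative; the file
+# header for this script is missing from the diff):
+#   python bin/plot.py vectors_2d.txt -o plot.pdf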
  4 +
  5 +import os
  6 +import numpy as np
  7 +from sklearn.cluster import KMeans
  8 +import matplotlib.pyplot as plt
  9 +import argparse
  10 +import json
  11 +
  12 +# Defining argparse
  13 +parser = argparse.ArgumentParser(prog='Plotter', description='Plot a file of 2d or 3d dimension')
  14 +parser.add_argument('filepath', type=str,
  15 + help='the path of the file you want to plot')
  16 +parser.add_argument('-o', '--output', type=str,
  17 +    default='plot.pdf',
  18 +    help='the path of the plotted file')
  19 +parser.add_argument('-t', '--toy', action='store_true',
  20 +    help='test the script on a toy example; do not process the whole file content')
  21 +
  22 +args = parser.parse_args()
  23 +
  24 +# Editing global variable
  25 +FILE_PATH=args.filepath
  26 +OUTFILE_PATH = args.output
  27 +TOY_VERSION = args.toy
  28 +
  29 +# Defining vectors with default number of columns
  30 +vectors = np.empty((0, 64), np.float32)
  31 +metas = np.empty((0, 4), np.float32)
  32 +
  33 +# READ DATA
  34 +with open(os.path.join(FILE_PATH), "r") as f:
  35 + for i, line in enumerate(f):
  36 + if TOY_VERSION == True and i > 100:
  37 + break
  38 + spl_line = line.split(" ")
  39 + if(len(vectors) == 0):
  40 + vectors = np.empty((0, len(spl_line[1:])), np.float32)
  41 + metas = np.append(
  42 + metas,
  43 + np.asarray([spl_line[0].split(",")]),
  44 + axis=0)
  45 +
  46 + vectors = np.append(
  47 + vectors,
  48 + np.asarray([spl_line[1:]], dtype=np.float32),
  49 + axis=0)
  50 +
  51 +vectors_T = np.transpose(vectors)
  52 +
  53 +
  54 +# Plot the file (a 3d scatter needs a 3d projection; plain axes are 2d)
  55 +if(vectors_T.shape[0] == 3):
  56 +    fig = plt.figure()
  57 +    ax = fig.add_subplot(projection='3d')
  58 +    ax.scatter(vectors_T[0], vectors_T[1], vectors_T[2])
  59 +    ax.set_zlabel('Axis 3', fontsize=15)
  60 +else:
  61 +    fig, ax = plt.subplots()
  62 +    ax.scatter(vectors_T[0], vectors_T[1])  #c=close, s=volume, alpha=0.5)
  63 +
  64 +ax.set_xlabel('Axis 1', fontsize=15)
  65 +ax.set_ylabel('Axis 2', fontsize=15)
  66 +
  67 +plt.savefig(OUTFILE_PATH)
bin/plot_character.py
  1 +'''
  2 +Take a file and plot its data onto a 2d or 3d axis depending on the data.
  3 +Automatic detection of the number of dimensions.
  4 +'''
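+# Example usage (file names are illustrative):
+#   python bin/plot_character.py tsne_2d.txt -o characters.pdf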
  5 +
  6 +import os
  7 +import numpy as np
  8 +from sklearn.cluster import KMeans
  9 +import matplotlib.pyplot as plt
  10 +import argparse
  11 +import json
  12 +import pandas as pd
  13 +
  14 +# Defining useful functions
  15 +
  16 +'''
  17 +Read the file whose content is metas and vectors.
  18 +Returns two numpy arrays: (metas, vectors)
  19 +
  20 +'''
  21 +def read_vector_file(filename, toy_version=False):
  22 +    vectors = np.empty((0, 1), np.float32)
  23 +    metas = np.empty((0, 4), np.float32)
  24 +    with open(filename, "r") as f:
  25 +        for i, line in enumerate(f):
  26 +            if toy_version and i > 100:
  27 +                break
  28 +            spl_line = line.split(" ")
  29 +            if(len(vectors) == 0):
  30 +                vectors = np.empty((0, len(spl_line[1:])), np.float32)
  31 +            metas = np.append(
  32 +                metas,
  33 +                np.asarray([spl_line[0].split(",")]),
  34 +                axis=0)
  35 +
  36 +            vectors = np.append(
  37 +                vectors,
  38 +                np.asarray([spl_line[1:]], dtype=np.float32),
  39 +                axis=0)
  40 +    return (metas, vectors)
  41 +
  42 +
  43 +# Defining argparse
  44 +parser = argparse.ArgumentParser(description='Plot a file of 2d or 3d dimension')
  45 +parser.add_argument('vectorfile', type=str,
  46 +    help='the path of the vectors file')
  47 +parser.add_argument('-o', '--output', type=str,
  48 +    default='plot.pdf',
  49 +    help='the path of the plotted file')
  50 +parser.add_argument('-t', '--toy', action='store_true',
  51 +    help='test the script on a toy example; do not process the whole file content')
  52 +
  53 +args = parser.parse_args()
  54 +
  55 +# Editing global variable
  56 +VECTORFILE_PATH=args.vectorfile
  57 +OUTFILE_PATH = args.output
  58 +TOY_VERSION = args.toy
  59 +
  60 +
  61 +# Get Vectors
  62 +metas, vectors = read_vector_file(VECTORFILE_PATH, toy_version = TOY_VERSION)
  63 +vectors_T = np.transpose(vectors)
  64 +
  65 +print("Number of characters: " + str(len(np.unique(np.transpose(metas)[1]))))
  66 +df = pd.DataFrame(dict(
  67 + x=vectors_T[0],
  68 + y=vectors_T[1],
  69 + character=np.transpose(metas)[1]
  70 + ))
  71 +
  72 +groups = df.groupby('character')
  73 +
  74 +# Plot
  75 +fig, ax = plt.subplots()
  76 +
  77 +for character, group in groups:
  78 +    ax.plot(group.x, group.y, marker='o', linestyle='', ms=2, label=character)
  79 +plt.savefig(OUTFILE_PATH)
  80 +print("Plot saved (success not verified).")
bin/plot_clusters.py
  1 +'''
  2 +Take a file and plot its data onto a 2d or 3d axis depending on the data.
  3 +'''
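+# Example usage (file names are illustrative):
+#   python bin/plot_clusters.py clusters.txt tsne_2d.txt -o clusters.pdf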
  4 +
  5 +import os
  6 +import numpy as np
  7 +from sklearn.cluster import KMeans
  8 +import matplotlib.pyplot as plt
  9 +import argparse
  10 +import json
  11 +import pandas as pd
  12 +
  13 +# Defining useful functions
  14 +
  15 +'''
  16 +Read the file whose content is metas and vectors.
  17 +Returns two numpy arrays: (metas, vectors)
  18 +
  19 +'''
  20 +def read_vector_file(filename, toy_version=False):
  21 +    vectors = np.empty((0, 1), np.float32)
  22 +    metas = np.empty((0, 4), np.float32)
  23 +    with open(filename, "r") as f:
  24 +        for i, line in enumerate(f):
  25 +            if toy_version and i > 100:
  26 +                break
  27 +            spl_line = line.split(" ")
  28 +            if(len(vectors) == 0):
  29 +                vectors = np.empty((0, len(spl_line[1:])), np.float32)
  30 +            metas = np.append(
  31 +                metas,
  32 +                np.asarray([spl_line[0].split(",")]),
  33 +                axis=0)
  34 +
  35 +            vectors = np.append(
  36 +                vectors,
  37 +                np.asarray([spl_line[1:]], dtype=np.float32),
  38 +                axis=0)
  39 +    return (metas, vectors)
  40 +
  41 +
  42 +'''
  43 +Check if the two given files have the same order.
  44 +'''
  45 +def check_files(vector_file, cluster_file):
  46 + with open(vector_file, "r") as f1, open(cluster_file, "r") as f2:
  47 + for line1, line2 in zip(f1, f2):
  48 + line1_str = line1.strip()
  49 + line2_str = line2.strip()
  50 + metas1 = line1_str.split(" ")[0].split(",")
  51 + metas2 = line2_str.split(" ")[0].split(",")
  52 + if(not metas1[0] == metas2[0] or not metas1[3] == metas2[3]):
  53 + return False
  54 + return True
  55 +
  56 +
  57 +
  58 +
  59 +
  60 +# Defining argparse
  61 +parser = argparse.ArgumentParser(prog='Plotter', description='Plot a file of 2d or 3d dimension')
  62 +parser.add_argument('clusterfile', type=str,
  63 +    help='the path of the cluster file')
  64 +parser.add_argument('vectorfile', type=str,
  65 +    help='the path of the vectors file')
  66 +parser.add_argument('-o', '--output', type=str,
  67 +    default='plot.pdf',
  68 +    help='the path of the plotted file')
  69 +parser.add_argument('-t', '--toy', action='store_true',
  70 +    help='test the script on a toy example; do not process the whole file content')
  71 +
  72 +args = parser.parse_args()
  73 +
  74 +# Editing global variable
  75 +CLUSTERFILE_PATH=args.clusterfile
  76 +VECTORFILE_PATH=args.vectorfile
  77 +OUTFILE_PATH = args.output
  78 +TOY_VERSION = args.toy
  79 +
  80 +if not check_files(VECTORFILE_PATH, CLUSTERFILE_PATH):
  81 +    print("The files are not in the same order. In a future version this will raise an exception. Stopping the process.")
  82 +    exit(1)
  83 +
  84 +# Get Vectors
  85 +metas, vectors = read_vector_file(VECTORFILE_PATH, toy_version = TOY_VERSION)
  86 +vectors_T = np.transpose(vectors)
  87 +
  88 +# Get Clusters
  89 +metas, clusters = read_vector_file(CLUSTERFILE_PATH, toy_version = TOY_VERSION)
  90 +
  91 +#print(np.transpose(clusters)[0])
  92 +#print(np.transpose(metas)[0])
  93 +df = pd.DataFrame(dict(
  94 + x=vectors_T[0],
  95 + y=vectors_T[1],
  96 + cluster=np.transpose(clusters)[0]
  97 + ))
  98 +
  99 +groups = df.groupby('cluster')
  100 +
  101 +# Plot
  102 +fig, ax = plt.subplots()
  103 +
  104 +for cluster, group in groups:
  105 +    ax.plot(group.x, group.y, marker='o', linestyle='', ms=2, label=cluster)
  106 +ax.legend()
  107 +plt.savefig(OUTFILE_PATH)
bin/tsne_clustering_plot.py
  1 +'''
  2 +Take one file with a clustering,
  3 +take another file with a t-SNE embedding,
  4 +and then plot them.
  5 +'''
  1 +'''
  2 +The goal of this script is to compute the t-SNE of p-vectors.
  3 +'''
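+# Example usage (the script and file names are illustrative; the file
+# header for this script is missing from the diff):
+#   python bin/tsne_pvector.py masseffect_pvectors.txt -o tsne_2d.txt -n 2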
  4 +
  5 +import os
  6 +import argparse
  7 +import numpy as np
  8 +from sklearn.manifold import TSNE
  9 +
  10 +# Defining argparse
  11 +parser = argparse.ArgumentParser(prog='pvector tsne', description='Calculate the tsne representation of pvectors in 2 or 3d')
  12 +parser.add_argument('filepath', type=str,
  13 + help='the path of the file you want to calculate tsne')
  14 +parser.add_argument('-o', '--output', type=str,
  15 + default='.',
  16 + help='the path of the output file.')
  17 +parser.add_argument('-n', '--n-comp', type=int, choices=[2, 3],
  18 +    default=2,
  19 +    help='number of components output of tsne')
  20 +parser.add_argument('-t', '--toy', action='store_true',
  21 + help='test the script on a toy example. Do not test all the file content.')
  22 +args = parser.parse_args()
  23 +
  24 +# Editing global variable
  25 +FILE_PATH=args.filepath
  26 +OUTFILE_PATH=args.output
  27 +TOY_VERSION=args.toy
  28 +N_COMP=args.n_comp
  29 +
  30 +# Defining pvectors with default number of columns
  31 +pvectors = np.empty((0, 64), np.float32)
  32 +metas = np.empty((0, 4), np.float32)
  33 +
  34 +
  35 +# READ DATA
  36 +with open(os.path.join(FILE_PATH), "r") as f:
  37 + for i, line in enumerate(f):
  38 + if TOY_VERSION == True and i > 100:
  39 + break
  40 + spl_line = line.split(" ")
  41 + if(len(pvectors) == 0):
  42 + pvectors = np.empty((0, len(spl_line[1:])), np.float32)
  43 + metas = np.append(
  44 + metas,
  45 + np.asarray([spl_line[0].split(",")]),
  46 + axis=0)
  47 + pvectors = np.append(
  48 + pvectors,
  49 + np.asarray([spl_line[1:]], dtype=np.float32),
  50 + axis=0)
  51 +
  52 +
  53 +
  54 +# PREPARE SAVE FILE FUNCTION
  55 +def save_file(filepath, metas, values):
  56 + with open(filepath, "w") as f:
  57 + for i, value in enumerate(values):
  58 + metas_str = ",".join(str(v) for v in metas[i])
  59 + try:
  60 + infos_str = " ".join(str(v) for v in values[i])
  61 + except TypeError as te:
  62 + infos_str = str(values[i])
  63 + f.write(metas_str + " " + infos_str + "\n")
  64 +
  65 +# CALCULATE T-SNE
  66 +X_embedded = TSNE(n_components=N_COMP).fit_transform(pvectors)
  67 +save_file(OUTFILE_PATH, metas, X_embedded)