# tsne_pvector.py 2.23 KB
'''
The goal of this script is to calculate the t-SNE embedding of pvectors
and save the result to a file.
'''

import os
import argparse
import numpy as np
from sklearn.manifold import TSNE

# Defining argparse
parser = argparse.ArgumentParser(prog='pvector tsne', description='Calculate the tsne representation of pvector in 3 or 2d')
parser.add_argument('filepath', type=str,
                    help='the path of the file you want to calculate tsne')
parser.add_argument('-o', '--output', type=str,
                    default='.',
                    help='the path of the output file. If it is a directory '
                         '(default: the current directory), the result is '
                         'written there as "<input file name>.tsne".')
# The default is given as an int so it matches `type` and `choices` directly
# instead of relying on argparse converting a string default.
parser.add_argument('-n', '--n-comp', type=int, choices=[2, 3],
                    default=2,
                    help='number of components output of tsne')
parser.add_argument('-t', '--toy', action='store_true',
                    help='test the script on a toy example. Do not test all the file content.')
args = parser.parse_args()

# Editing global variable
FILE_PATH = args.filepath
OUTFILE_PATH = args.output
# A directory (including the default '.') cannot be opened for writing, which
# previously made the script fail only after the whole t-SNE had been computed.
# Resolve it to a file inside that directory, named after the input file.
if os.path.isdir(OUTFILE_PATH):
    OUTFILE_PATH = os.path.join(OUTFILE_PATH,
                                os.path.basename(FILE_PATH) + '.tsne')
TOY_VERSION = args.toy
N_COMP = args.n_comp

# READ DATA
def _read_pvectors(path, toy=False, toy_limit=100):
    """Parse a pvector file into a ``(metas, pvectors)`` pair of arrays.

    Every non-blank line is split on whitespace: the first token is a
    comma-separated list of meta fields, the remaining tokens are the
    vector values, parsed as ``float32``.

    Args:
        path: path of the text file to read.
        toy: when true, stop reading once the line index exceeds
            ``toy_limit``.
        toy_limit: highest line index (0-based, inclusive) kept in toy mode.

    Returns:
        ``metas``: 2-D array of strings, one row of meta fields per line.
        ``pvectors``: 2-D ``float32`` array, one vector per line.
        If no line was read, empty arrays of shape ``(0, 4)`` and
        ``(0, 64)`` are returned.

    Raises:
        ValueError: if a value cannot be parsed as a float, or if a line
            does not have the same number of meta fields / vector values
            as the first line.
    """
    meta_rows = []
    vector_rows = []
    with open(path, "r") as f:
        for i, line in enumerate(f):
            if toy and i > toy_limit:
                break
            # split() without argument tolerates tabs, repeated or trailing
            # spaces and drops the newline from the last token.
            fields = line.split()
            if not fields:
                # Blank line: nothing to parse.
                continue
            meta = fields[0].split(",")
            vector = np.asarray(fields[1:], dtype=np.float32)
            if vector_rows and (len(meta) != len(meta_rows[0])
                                or vector.shape != vector_rows[0].shape):
                raise ValueError(
                    "line {}: expected {} meta fields and {} vector values, "
                    "got {} and {}".format(
                        i + 1, len(meta_rows[0]), vector_rows[0].shape[0],
                        len(meta), vector.shape[0]))
            meta_rows.append(meta)
            vector_rows.append(vector)

    if not vector_rows:
        # Nothing was read: fall back to empty arrays with the default
        # number of columns.
        return np.empty((0, 4), np.float32), np.empty((0, 64), np.float32)

    # Build each array once; appending to an ndarray inside the loop copies
    # the whole array at every line.
    return np.asarray(meta_rows), np.vstack(vector_rows)


metas, pvectors = _read_pvectors(FILE_PATH, toy=TOY_VERSION)



# PREPARE SAVE FILE FUNCTION
def save_file(filepath, metas, values):
    """Write one text line per entry of ``values`` to ``filepath``.

    Each line is the comma-joined items of ``metas[i]``, a single space,
    then the space-joined items of ``values[i]``. When ``values[i]`` is
    not iterable (a scalar), its plain string form is written instead.
    The file is opened in write mode, so existing content is replaced.
    """
    with open(filepath, "w") as out:
        for idx, row in enumerate(values):
            meta_part = ",".join(map(str, metas[idx]))
            try:
                value_part = " ".join(map(str, row))
            except TypeError:
                # Scalar entry: there is nothing to iterate over.
                value_part = str(row)
            out.write("{} {}\n".format(meta_part, value_part))

# CALCULATE T-SNE
# Embed the pvectors into N_COMP dimensions, then hand the embedding and the
# matching metas to save_file.
tsne_model = TSNE(n_components=N_COMP)
X_embedded = tsne_model.fit_transform(pvectors)
save_file(OUTFILE_PATH, metas, X_embedded)