Commit ac78b07ea0ab18b7855f1b752e90fcac99440c98

Authored by Mathias Quillot
1 parent b8acebc1ed
Exists in master

All base bin files added

Showing 11 changed files with 751 additions and 0 deletions

bin/__pycache__/data.cpython-36.pyc
No preview for this file type
bin/cluster_kmeans.py
File was created 1 '''
2 This script aims at computing k-means for a given
3 data set.
4 '''
5
6 import argparse
7 import numpy as np
8 from sklearn.cluster import KMeans
9 from os import path
10
11 import pickle
12 from data import read_file, index_by_id
13
14 # -- ARGPARSE --
15 parser = argparse.ArgumentParser(description="Cluster with kmeans")
16 parser.add_argument("features", type=str, help="Features file")
17 parser.add_argument("list", type=str, help="List on which apply kmeans")
18 parser.add_argument("outdir", type=str, help="Output directory for k-means models")
19 parser.add_argument("--kmin", type=int, help="minimum k", default=2)
20 parser.add_argument("--kmax", type=int, help="maximum k", default=100)
21
22 args = vars(parser.parse_args())
23 FEATURES = args["features"]
24 LST = args["list"]
25 OUTDIR = args["outdir"]
26 KMIN = args["kmin"]
27 KMAX = args["kmax"]
28
29 # -- READ FILES --
30 features = read_file(FEATURES)
31 feat_ind = index_by_id(features)
32
33 lst = read_file(LST)
34
35 # -- TRANSFORM INTO NUMPY --
36 X = np.asarray([feat_ind[x[0][0]][x[0][3]][1] for x in lst])
37
38 Ks = range(KMIN, KMAX+1)
39 for k in Ks:
40 kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
41 pickle.dump(kmeans, open(path.join(OUTDIR, "clustering_" + str(k) + ".pkl"), "wb"))
42
43
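For reference, a model saved by this loop can later be reloaded and applied to new vectors with the standard pickle and scikit-learn calls. A minimal sketch, assuming a model trained with k=8 was written to exp/kmeans (both the directory and the value of k are hypothetical):

import pickle
import numpy as np

# Hypothetical path: adjust to the outdir and k actually used above.
with open("exp/kmeans/clustering_8.pkl", "rb") as f:
    kmeans = pickle.load(f)

# Dummy vectors with the same dimensionality as the training features.
new_vectors = np.random.rand(2, kmeans.cluster_centers_.shape[1])
print(kmeans.predict(new_vectors))  # cluster indices in [0, 8)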
bin/clustering_pvector.py
File was created 1 '''
2 The goal of this script is to apply a clustering to pvectors in order to find new classes assigned to each utterance or frame.
3 These new classes can be used to train new systems, for example replacing character classes by the classes computed from the clustering.
4 We hope this will generate interesting classes that help the system understand the structure of the voices.
5
6 TODO: Change it so that it takes a number (1, 2, 3, 4) and computes everything needed, such as the clustering. Train on the train set and then project the test set onto this clustering in order to know which cluster it belongs to.
7 '''
8
9 import os
10 import numpy as np
11 from sklearn.cluster import KMeans
12 import matplotlib.pyplot as plt
13 import argparse
14 import pandas as pd
15 import pickle
16
17
18 '''
19 Return the file content as two numpy arrays: (metas, vectors)
20 '''
21 def read_vecfile(filepath, toy_version=False):
22 vectors = ""
23 metas = ""
24 with open(filepath, "r") as f:
25 for i, line in enumerate(f):
26 if toy_version == True and i > 100:
27 break
28 spl_line = line.split(" ")
29
30 if(len(vectors) == 0):
31 vectors = np.empty((0, len(spl_line[1:])), np.float32)
32 metas = np.empty((0, len(spl_line[0].split(","))))
33
34 # Then we add the current line to the data
35 metas = np.append(
36 metas,
37 np.asarray([spl_line[0].split(",")]),
38 axis=0)
39
40 vectors = np.append(
41 vectors,
42 np.asarray([spl_line[1:]], dtype=np.float32),
43 axis=0)
44 return (metas, vectors)
45
46 '''
47 Return the list of metas from the list file
48 '''
49 def read_lstfile(filepath, toy_version=False):
50 metas = np.empty((0, 4))
51 with open(filepath, "r") as f:
52 for i, line in enumerate(f):
53 if toy_version == True and i > 100:
54 break
55 metas = np.append(
56 metas,
57 np.asarray([line.rstrip('\n').split(",")]),
58 axis=0)
59 return metas
60
61 '''
62 Save a vector file from metas and vector values
63 '''
64 def save_file(filepath, metas, values=None):
65 with open(filepath, "w") as f:
66 for i in range(len(metas)):
67 metas_str = ",".join(str(v) for v in metas[i])
68 if values is not None:
69 try:
70 infos_str = " ".join(str(v) for v in values[i])
71 except TypeError as te:
72 infos_str = str(values[i])
73 f.write(metas_str + " " + infos_str + "\n")
74 else:
75 f.write(metas_str + "\n")
76
77 '''
78 Take the data and index them.
79 '''
80 def index_data(metas, vectors):
81 data = {}
82 data["en-us"] = {}
83 data["fr-fr"] = {}
84 for i, vector in enumerate(vectors):
85 meta = metas[i]
86 data[meta[0]][meta[3]] = {}
87 data[meta[0]][meta[3]]["metas"] = meta
88 data[meta[0]][meta[3]]["vector"] = vector
89 return data
90
91
92
93 '''
94 Get a subset of the base data from a
95 list.
96 '''
97 def get_subdata(data, lst):
98 metas = ""
99 vectors = ""
100 for meta in lst:
101 vector = data[meta[0]][meta[3]]["vector"]
102 if(len(metas) == 0):
103 metas = np.empty((0, len(meta)))
104 vectors = np.empty((0, len(vector)), np.float64)
105 metas = np.append(
106 metas,
107 np.asarray([data[meta[0]][meta[3]]["metas"]]),
108 axis=0)
109 vectors = np.append(
110 vectors,
111 np.asarray([vector]),
112 axis=0)
113 return metas, vectors
114
115
116 '''
117 Apply clustering to the data of filename.
118 Use list files to determine the train, validation and test sets.
119 Save the file with the given suffix.
120 Check the existence of the files before calculating and saving:
121 if the two files already exist, it will not calculate them again.
122
123 However, if one of the files is missing, this function will
124 calculate everything again.
125
126 TODO: Add a variable to force the calculation of all the files
127 even if they exist.
128 '''
129 def apply_clustering(filename, dir_lst, dir_data, suffix_outfile):
130
131 # Apply clustering to the full (non-toy) data
132 metas, vectors = read_vecfile(os.path.join(dir_data, filename), toy_version=False)
133 data = index_data(metas, vectors)
134
135
136 # Get Train
137 train_lst = read_lstfile(os.path.join(dir_lst, "train_" + str(NUMBER) + ".lst"))
138 train_metas, train_vectors = get_subdata(data, train_lst)
139
140 # Get Val
141 val_lst = read_lstfile(os.path.join(dir_lst, "val_" + str(NUMBER) + ".lst"))
142 val_metas, val_vectors = get_subdata(data, val_lst)
143
144 # Get Test
145 test_lst = read_lstfile(os.path.join(dir_lst, "test_" + str(NUMBER) + ".lst"))
146 test_metas, test_vectors = get_subdata(data, test_lst)
147
148 # Verify shapes
149 print("verify shapes")
150 print(train_metas.shape)
151 print(val_metas.shape)
152 print(test_metas.shape)
153
154 # Train the k-means on the train set and predict clusters on val + test
155 #Ks = [12, 24, 48]
156
157 print("k=[", end="")
158 Ks = [6,12,24,48,64]
159 for k in Ks:
160 # Process the name
161 suffix = "_" + suffix_outfile if not suffix_outfile == "" else ""
162 k_str = "{:03d}".format(k) # K in string
163 filename_pickle = os.path.join(
164 DIR_DATA,
165 "clusters_trained_on_train_" +str(k_str)+ "_pickle_" + suffix + ".txt")
166 filename_clusters = os.path.join(
167 DIR_DATA,
168 "clusters_trained_on_train_" +str(k_str)+ suffix + ".txt")
169
170 # Check if one of the two files does not exist
171 condition = not(
172 os.path.exists(filename_pickle)
173 and os.path.exists(filename_clusters)
174 )
175
176 if condition:
177 print(str(k)+",", end=" ")
178 kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(
179 train_vectors)
180 test_pred = kmeans.predict(np.concatenate((val_vectors, test_vectors), axis=0))
181 metas_tosave = np.concatenate([train_metas, val_metas, test_metas], axis=0)
182 values_tosave = np.concatenate([kmeans.labels_, test_pred], axis=0)
183 metas_tosave[:, 1] = values_tosave # Replace char by clusters
184 save_file(filename_clusters, metas_tosave)
185 pickle.dump(kmeans, open( filename_pickle, "wb" ) )
186 print("]")
187
188 for NUMBER in range(1, 5):
189 print("JACKKNIFING NUMBER: " + str(NUMBER))
190 DIR_MAIN="exp/pvector-1"
191 DIR_DATA=os.path.join(DIR_MAIN, str(NUMBER))
192 DIR_LST=os.path.join(DIR_MAIN, "lst")
193 OUTFILE_NAME="clustering"
194
195 print("Calculating mass_effect_pvectors")
196 apply_clustering("masseffect_pvectors.txt",
197 dir_lst = os.path.join(DIR_MAIN, "lst"),
198 dir_data = DIR_DATA,
199 suffix_outfile = "")
200
201 print("Calculating mass_effect_pvectors_final")
202 apply_clustering("masseffect_pvectors_final.txt",
203 dir_lst = os.path.join(DIR_MAIN, "lst"),
204 dir_data = DIR_DATA,
205 suffix_outfile = "final")
206
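As read_vecfile and save_file above imply, each line of a vector file is a comma-separated meta block followed by space-separated float values. A minimal sketch of that format, with invented metas and values:

# Hypothetical line in the layout handled by read_vecfile / save_file.
line = "en-us,CHAR,spk,0001 0.12 -0.30 0.57"

metas = line.split(" ")[0].split(",")              # ['en-us', 'CHAR', 'spk', '0001']
values = [float(v) for v in line.split(" ")[1:]]   # [0.12, -0.3, 0.57]

# Writing it back in the same layout:
out_line = ",".join(metas) + " " + " ".join(str(v) for v in values)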
bin/data.py
File was created 1 '''
2 This module aims at loading and writing files.
3 Our files follow a specific format that
4 is not standard. This is why I hope these
5 functions make reading the files easier.
6
7 For more information about the data, please
8 read the README file.
9 '''
10
11 import sys
12
13 def read_file(filepath):
14 '''
15 Read the file and return an array with pairs
16 where each pair is composed by the metas and the
17 features.
18 '''
19 data = []
20 with open(filepath, "r") as f:
21 for line in f:
22 split_line = line.replace("\n", "").split(" ")
23 metas = split_line[0].split(",")
24 features = split_line[1:]
25 data.append((metas, features))
26 return data
27
28
29 def index_by(data, num_col):
30 '''
31 Allows the user to index data by the value of a given meta column.
32 '''
33 indexed = {}
34 for line in data:
35 metas = line[0]
36 features = line[1]
37 if metas[num_col] not in indexed:
38 indexed[metas[num_col]] = []
39 indexed[metas[num_col]].append((metas, features))
40 return indexed
41
42
43 def index_by_id(data):
44 '''
45 Allows the user to index data by id.
46 Indexing data by id consists in indexing twice
47 because data have two keys: one with the language
48 and the other one with the id of the sentence.
49 '''
50 indexed = {}
51 for line in data:
52 metas = line[0]
53 id_sen = metas[3]
54 lang = metas[0]
55 if lang not in indexed:
56 indexed[lang] = {}
57 indexed[lang][id_sen] = line
58 return indexed
59
60
61 def write_line(metas, features, f=sys.stdout):
62 '''
63 Just print the line. No need to specify a file.
64
65 metas: list of meta information
66 features: feature vector
67 f: file to write to
68 '''
69 print(",".join(metas) + " " + " ".join(features), file=f)
70
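A short usage sketch of these helpers, assuming a hypothetical features.txt whose lines follow the format described above (comma-separated metas, then the feature values):

from data import read_file, index_by_id, write_line

# Hypothetical file; each line: "lang,char,spk,id f1 f2 ..."
data = read_file("features.txt")
indexed = index_by_id(data)

# Access one utterance by language and sentence id (both keys invented here).
metas, features = indexed["en-us"]["0001"]
write_line(metas, features)  # prints "en-us,char,spk,0001 f1 f2 ..." to stdout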
bin/extract_kmeans.py
File was created 1 '''
2 This script extracts k-means cluster assignments from a
3 previously trained k-means model.
4 '''
5
6 import argparse
7 import numpy as np
8 import pickle
9 from data import read_file, index_by_id, write_line
10 import sys
11
12 # -- ARGPARSE --
13 parser = argparse.ArgumentParser(description="extract clusters")
14 parser.add_argument("model", type=str, help="k-means model pickle")
15 parser.add_argument("features", type=str, help="features")
16 parser.add_argument("list", type=str, help="list file")
17 parser.add_argument("--outfile", type=str, default=None, help="output file std")
18
19 args = vars(parser.parse_args())
20 MODEL = args["model"]
21 FEATURES = args["features"]
22 LST = args["list"]
23 OUTFILE = args["outfile"]
24
25 if OUTFILE is None:
26 OUTFILE = sys.stdout
27 else:
28 OUTFILE = open(OUTFILE, "w")
29
30 # -- READ FILE --
31 features = read_file(FEATURES)
32 feat_ind = index_by_id(features)
33
34 lst = read_file(LST)
35
36 kmeans = pickle.load(open(MODEL, "rb"))
37
38
39 # -- CONVERT TO NUMPY --
40 X = np.asarray([feat_ind[x[0][0]][x[0][3]][1] for x in lst])
41 predictions = kmeans.predict(X)
42
43 for i, line in enumerate(lst):
44 meta = line[0]
45 meta[1] = str(predictions[i])
46 write_line(
47 meta,
48 feat_ind[meta[0]][meta[3]][1],
49 OUTFILE
50 )
51
52 # -- CLOSE OUT FILE IF NECESSARY --
53 if OUTFILE is not sys.stdout:
54 OUTFILE.close()
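The loop above overwrites the second meta column with the predicted cluster before writing each line back out. A hypothetical illustration of that substitution (metas and prediction invented):

meta = ["en-us", "ARIA", "spk", "0042"]   # original metas, character in column 1
prediction = 7                            # cluster id returned by kmeans.predict
meta[1] = str(prediction)
print(",".join(meta))                     # -> "en-us,7,spk,0042"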
bin/extract_vectors.py
File was created 1 '''
2 The goal of this script is to extract vectors from a list.
3 One file contains the full content, and the list only enumerates the
4 vectors you want to keep.
5 '''
6
7 import os
8 import numpy as np
9 import argparse
10
11 parser = argparse.ArgumentParser(description='Extract a subset of vectors')
12 parser.add_argument('vectorsfile', type=str,
13 help='the path of the file containing the vectors')
14 parser.add_argument('listfile', type=str,
15 help='the path of the file containing the list of vectors kept')
16 parser.add_argument('-o', '--output', type=str,
17 default='a.out',
18 help='the path of the output file containing the vectors kept')
19
20 args = parser.parse_args()
21
22 # Editing global variable
23 VECTOR_FILE = args.vectorsfile
24 LIST_FILE = args.listfile
25 OUTPUT_FILE = args.output
26
27 # READ VECTOR DATA, INDEXED BY (LANGUAGE, SENTENCE ID)
28 data = {}
29 data["en-us"] = {}
30 data["fr-fr"] = {}
31 with open(VECTOR_FILE, "r") as f:
32 for line in f:
33 spl_line = line.split(" ")
34 spl_meta = spl_line[0].split(",")
35 lang = spl_meta[0]
36 iden = spl_meta[3]
37 data[lang][iden] = line
38
39 # READ LIST AND WRITE NEW FILE
40 with open(LIST_FILE, "r") as f, open(OUTPUT_FILE, "w") as o:
41 for line in f:
42 spl_meta = line.rstrip("\n").split(",")
43 lang = spl_meta[0]
44 iden = spl_meta[3]
45 o.write(data[lang][iden])
46
53
File was created 1 '''
2 Take a file and plot its data onto a 2d or 3d axis depending on the data.
3 '''
4
5 import os
6 import numpy as np
7 from sklearn.cluster import KMeans
8 import matplotlib.pyplot as plt
9 import argparse
10 import json
11
12 # Defining argparse
13 parser = argparse.ArgumentParser(prog='Plotter', description='Plot a file of 2d or 3d data')
14 parser.add_argument('filepath', type=str,
15 help='the path of the file you want to plot')
16 parser.add_argument('-o', '--output', type=str,
17 default='plot.pdf',
18 help='the path of the plotted file')
19 parser.add_argument('-t', '--toy', action='store_true',
20 help='test the script on a toy example. Do not test all the file content')
21
22 args = parser.parse_args()
23
24 # Editing global variable
25 FILE_PATH=args.filepath
26 OUTFILE_PATH = args.output
27 TOY_VERSION = args.toy
28
29 # Defining vectors with default number of column
30 vectors = np.empty((0, 64), np.float32)
31 metas = np.empty((0, 4), np.float32)
32
33 # READ DATA
34 with open(os.path.join(FILE_PATH), "r") as f:
35 for i, line in enumerate(f):
36 if TOY_VERSION == True and i > 100:
37 break
38 spl_line = line.split(" ")
39 if(len(vectors) == 0):
40 vectors = np.empty((0, len(spl_line[1:])), np.float32)
41 metas = np.append(
42 metas,
43 np.asarray([spl_line[0].split(",")]),
44 axis=0)
45
46 vectors = np.append(
47 vectors,
48 np.asarray([spl_line[1:]], dtype=np.float32),
49 axis=0)
50
51 vectors_T = np.transpose(vectors)
52
53
54 # Plot the file (3d data needs an axis with a 3d projection)
55 if(vectors_T.shape[0] == 3):
56 from mpl_toolkits.mplot3d import Axes3D  # registers the '3d' projection
57 fig = plt.figure()
58 ax = fig.add_subplot(111, projection='3d')
59 ax.scatter(vectors_T[0], vectors_T[1], vectors_T[2])
60 ax.set_zlabel('Axis 3', fontsize=15)
61 else:
62 fig, ax = plt.subplots()
63 ax.scatter(vectors_T[0], vectors_T[1])
64
65 ax.set_xlabel('Axis 1', fontsize=15)
66 ax.set_ylabel('Axis 2', fontsize=15)
67
68 ax.set_title('Vector plot')
69
70 plt.savefig(OUTFILE_PATH)
71
bin/plot_character.py
File was created 1 '''
2 Take a file and plot its data onto a 2d or 3d axis depending on the data.
3 Automatic detection of the number of dimension.
4 '''
5
6 import os
7 import numpy as np
8 from sklearn.cluster import KMeans
9 import matplotlib.pyplot as plt
10 import argparse
11 import json
12 import pandas as pd
13
14 # Defining useful functions
15
16 '''
17 Read the file whose content is metas and vectors.
18 Returns two numpy arrays: (metas, vectors)
19
20 '''
21 def read_vector_file(filename, toy_version=False):
22 vectors = np.empty((0, 1), np.float32)
23 metas = np.empty((0, 4), np.float32)
24 with open(filename, "r") as f:
25 for i, line in enumerate(f):
26 if toy_version == True and i > 100:
27 break
28 spl_line = line.split(" ")
29 if(len(vectors) == 0):
30 vectors = np.empty((0, len(spl_line[1:])), np.float32)
31 metas = np.append(
32 metas,
33 np.asarray([spl_line[0].split(",")]),
34 axis=0)
35
36 vectors = np.append(
37 vectors,
38 np.asarray([spl_line[1:]], dtype=np.float32),
39 axis=0)
40 return (metas, vectors)
41
42
43 # Defining argparse
44 parser = argparse.ArgumentParser(description='Plot a file of 2d or 3d data')
45 parser.add_argument('vectorfile', type=str,
46 help='the path of the vectors file')
47 parser.add_argument('-o', '--output', type=str,
48 default='plot.pdf',
49 help='the path of the plotted file')
50 parser.add_argument('-t', '--toy', action='store_true',
51 help='test the script on a toy example. Do not test all the file content')
52
53 args = parser.parse_args()
54
55 # Editing global variable
56 VECTORFILE_PATH=args.vectorfile
57 OUTFILE_PATH = args.output
58 TOY_VERSION = args.toy
59
60
61 # Get Vectors
62 metas, vectors = read_vector_file(VECTORFILE_PATH, toy_version = TOY_VERSION)
63 vectors_T = np.transpose(vectors)
64
65 print("Number of characters: " + str(len(np.unique(np.transpose(metas)[1]))))
66 df = pd.DataFrame(dict(
67 x=vectors_T[0],
68 y=vectors_T[1],
69 character=np.transpose(metas)[1]
70 ))
71
72 groups = df.groupby('character')
73
74 # Plot
75 fig, ax = plt.subplots()
76
77 for character, group in groups:
78 ax.plot(group.x, group.y, marker='o', linestyle='', ms=2, label=character)
79 plt.savefig(OUTFILE_PATH)
80 print("Your plot is saved well (no check of this affirmation)")
81
bin/plot_clusters.py
File was created 1 '''
2 Take a file and plot its data onto a 2d or 3d axis depending on the data.
3 '''
4
5 import os
6 import numpy as np
7 from sklearn.cluster import KMeans
8 import matplotlib.pyplot as plt
9 import argparse
10 import json
11 import pandas as pd
12
13 # Defining useful functions
14
15 '''
16 Read the file whose content is metas and vectors.
17 Returns two numpy arrays: (metas, vectors)
18
19 '''
20 def read_vector_file(filename, toy_version=False):
21 vectors = np.empty((0, 1), np.float32)
22 metas = np.empty((0, 4), np.float32)
23 with open(filename, "r") as f:
24 for i, line in enumerate(f):
25 if toy_version == True and i > 100:
26 break
27 spl_line = line.split(" ")
28 if(len(vectors) == 0):
29 vectors = np.empty((0, len(spl_line[1:])), np.float32)
30 metas = np.append(
31 metas,
32 np.asarray([spl_line[0].split(",")]),
33 axis=0)
34
35 vectors = np.append(
36 vectors,
37 np.asarray([spl_line[1:]], dtype=np.float32),
38 axis=0)
39 return (metas, vectors)
40
41
42 '''
43 Check if the two given files have the same order.
44 '''
45 def check_files(vector_file, cluster_file):
46 with open(vector_file, "r") as f1, open(cluster_file, "r") as f2:
47 for line1, line2 in zip(f1, f2):
48 line1_str = line1.strip()
49 line2_str = line2.strip()
50 metas1 = line1_str.split(" ")[0].split(",")
51 metas2 = line2_str.split(" ")[0].split(",")
52 if(not metas1[0] == metas2[0] or not metas1[3] == metas2[3]):
53 return False
54 return True
55
56
57
58
59
60 # Defining argparse
61 parser = argparse.ArgumentParser(prog='Plotter', description='Plot a file of 2d or 3d data')
62 parser.add_argument('clusterfile', type=str,
63 help='the path of the cluster file')
64 parser.add_argument('vectorfile', type=str,
65 help='the path of the vectors file')
66 parser.add_argument('-o', '--output', type=str,
67 default='plot.pdf',
68 help='the path of the plotted file')
69 parser.add_argument('-t', '--toy', action='store_true',
70 help='test the script on a toy example. Do not test all the file content')
71
72 args = parser.parse_args()
73
74 # Editing global variable
75 CLUSTERFILE_PATH=args.clusterfile
76 VECTORFILE_PATH=args.vectorfile
77 OUTFILE_PATH = args.output
78 TOY_VERSION = args.toy
79
80 if not check_files(VECTORFILE_PATH, CLUSTERFILE_PATH):
81 print("The files are not in the same order. In a future version this will raise an exception. Stopping the process.")
82 exit(1)
83
84 # Get Vectors
85 metas, vectors = read_vector_file(VECTORFILE_PATH, toy_version = TOY_VERSION)
86 vectors_T = np.transpose(vectors)
87
88 # Get Clusters
89 metas, clusters = read_vector_file(CLUSTERFILE_PATH, toy_version = TOY_VERSION)
90
91 #print(np.transpose(clusters)[0])
92 #print(np.transpose(metas)[0])
93 df = pd.DataFrame(dict(
94 x=vectors_T[0],
95 y=vectors_T[1],
96 cluster=np.transpose(clusters)[0]
97 ))
98
99 groups = df.groupby('cluster')
100
101 # Plot
102 fig, ax = plt.subplots()
103
104 for cluster, group in groups:
105 ax.plot(group.x, group.y, marker='o', linestyle='', ms=2, label=cluster)
106 ax.legend()
107 plt.savefig(OUTFILE_PATH)
108
bin/tsne_clustering_plot.py
File was created 1 '''
2 Take one file with the clustering,
3 take another file with the t-SNE,
4 and then plot them.
5 '''
6
File was created 1 '''
2 The goal of this script is to calculate the t-SNE of pvectors.
3 '''
4
5 import os
6 import argparse
7 import numpy as np
8 from sklearn.manifold import TSNE
9
10 # Defining argparse
11 parser = argparse.ArgumentParser(prog='pvector tsne', description='Calculate the t-SNE representation of pvectors in 2 or 3 dimensions')
12 parser.add_argument('filepath', type=str,
13 help='the path of the file you want to calculate tsne')
14 parser.add_argument('-o', '--output', type=str,
15 default='.',
16 help='the path of the output file.')
17 parser.add_argument('-n', '--n-comp', type=int, choices=[2, 3],
18 default=2,
19 help='number of components output of tsne')
20 parser.add_argument('-t', '--toy', action='store_true',
21 help='test the script on a toy example. Do not test all the file content.')
22 args = parser.parse_args()
23
24 # Editing global variable
25 FILE_PATH=args.filepath
26 OUTFILE_PATH=args.output
27 TOY_VERSION=args.toy
28 N_COMP=args.n_comp
29
30 # Defining pvectors with default number of column
31 pvectors = np.empty((0, 64), np.float32)
32 metas = np.empty((0, 4), np.float32)
33
34
35 # READ DATA
36 with open(os.path.join(FILE_PATH), "r") as f:
37 for i, line in enumerate(f):
38 if TOY_VERSION == True and i > 100:
39 break
40 spl_line = line.split(" ")
41 if(len(pvectors) == 0):
42 pvectors = np.empty((0, len(spl_line[1:])), np.float32)
43 metas = np.append(
44 metas,
45 np.asarray([spl_line[0].split(",")]),
46 axis=0)
47 pvectors = np.append(
48 pvectors,
49 np.asarray([spl_line[1:]], dtype=np.float32),
50 axis=0)
51
52
53
54 # PREPARE SAVE FILE FUNCTION
55 def save_file(filepath, metas, values):
56 with open(filepath, "w") as f:
57 for i, value in enumerate(values):
58 metas_str = ",".join(str(v) for v in metas[i])
59 try:
60 infos_str = " ".join(str(v) for v in values[i])
61 except TypeError as te:
62 infos_str = str(values[i])
63 f.write(metas_str + " " + infos_str + "\n")
64
65 # CALCULATE T-SNE
66 X_embedded = TSNE(n_components=N_COMP).fit_transform(pvectors)
67 save_file(OUTFILE_PATH, metas, X_embedded)
68
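The saved file keeps the same "metas values" layout as the input, so it can be consumed by the plotting scripts above. A minimal read-back sketch, assuming the output was written to a hypothetical tsne_output.txt:

import numpy as np

metas, points = [], []
with open("tsne_output.txt", "r") as f:
    for line in f:
        fields = line.rstrip("\n").split(" ")
        metas.append(fields[0].split(","))
        points.append([float(v) for v in fields[1:]])

points = np.asarray(points)  # shape: (n_utterances, N_COMP)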