Quillot Mathias / Clustering

Browse Code »

Commit ac78b07ea0ab18b7855f1b752e90fcac99440c98

Authored by Mathias Quillot 2019-06-18 13:00:31 +0200

1 parent b8acebc1ed

Exists in master

All base bin files added

Showing 11 changed files with 751 additions and 0 deletions Side-by-side Diff

bin/__pycache__/data.cpython-36.pyc
bin/cluster_kmeans.py
bin/clustering_pvector.py
bin/data.py
bin/extract_kmeans.py
bin/extract_vectors.py
bin/plot.py
bin/plot_character.py
bin/plot_clusters.py
bin/tsne_clustering_plot.py
bin/tsne_pvector.py

bin/__pycache__/data.cpython-36.pyc

Diff comments View file @ ac78b07

No preview for this file type

bin/cluster_kmeans.py

Diff comments View file @ ac78b07

	1	+'''
	2	+This script computes k-means models for a given
	3	+data set, one model per value of k.
	4	+'''
	5	+
	6	+import argparse
	7	+import numpy as np
	8	+from sklearn.cluster import KMeans
	9	+from os import path
	10	+
	11	+import pickle
	12	+from data import read_file, index_by_id
	13	+
	14	+# -- ARGPARSE --
	15	+parser = argparse.ArgumentParser(description="Cluster with kmeans")
	16	+parser.add_argument("features", type=str, help="Features file")
	17	+parser.add_argument("list", type=str, help="List on which apply kmeans")
	18	+parser.add_argument("outdir", type=str, help="Output directory for k-means models")
	19	+parser.add_argument("--kmin", type=int, help="minimum k", default=2)
	20	+parser.add_argument("--kmax", type=int, help="maximum k", default=100)
	21	+
	22	+args = vars(parser.parse_args())
	23	+FEATURES = args["features"]
	24	+LST = args["list"]
	25	+OUTDIR = args["outdir"]
	26	+KMIN = args["kmin"]
	27	+KMAX = args["kmax"]
	28	+
	29	+# -- READE FILES --
	30	+features = read_file(FEATURES)
	31	+feat_ind = index_by_id(features)
	32	+
	33	+lst = read_file(LST)
	34	+
	35	+# -- TRANSFORM INTO NUMPY --
	36	+X = np.asarray([feat_ind[x[0][0]][x[0][3]][1] for x in lst])
	37	+
	38	+Ks = range(KMIN, KMAX+1)
	39	+for k in Ks:
	40	+ kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
	41	+ pickle.dump(kmeans, open(path.join(OUTDIR, "clustering_" + str(k) + ".pkl"), "wb"))

bin/clustering_pvector.py

Diff comments View file @ ac78b07

	1	+'''
	2	+The goal of this script is to apply a clustering to pvectors in order to find new classes assigned to each utterance or frame.
	3	+These new classes can then be used to train new systems, for example by replacing the character classes with the classes computed by the clustering.
	4	+We hope this will generate interesting classes that will help the system to understand the structure of the voices.
	5	+
	6	+TODO: Change it in such a way as to take a number (1, 2, 3, 4) and calculate everything needed like clustering. Train on the train set and then project the test on this clustering in order to know to what cluster it belong to.
	7	+'''
	8	+
	9	+import os
	10	+import numpy as np
	11	+from sklearn.cluster import KMeans
	12	+import matplotlib.pyplot as plt
	13	+import argparse
	14	+import pandas as pd
	15	+import pickle
	16	+
	17	+
	18	+'''
	19	+Return data in panda format version
	20	+'''
	21	+def read_vecfile(filepath, toy_version=False):
	22	+ vectors = ""
	23	+ metas = ""
	24	+ with open(filepath, "r") as f:
	25	+ for i, line in enumerate(f):
	26	+ if toy_version == True and i > 100:
	27	+ break
	28	+ spl_line = line.split(" ")
	29	+
	30	+ if(len(vectors) == 0):
	31	+ vectors = np.empty((0, len(spl_line[1:])), np.float32)
	32	+ metas = np.empty((0, len(spl_line[0].split(","))))
	33	+
	34	+ # Then we add the current line to the data
	35	+ metas = np.append(
	36	+ metas,
	37	+ np.asarray([spl_line[0].split(",")]),
	38	+ axis=0)
	39	+
	40	+ vectors = np.append(
	41	+ vectors,
	42	+ np.asarray([spl_line[1:]], dtype=np.float32),
	43	+ axis=0)
	44	+ return (metas, vectors)
	45	+
	46	+'''
	47	+Return list of metas of the listfile
	48	+'''
	49	+def read_lstfile(filepath, toy_version=False):
	50	+ metas = np.empty((0, 4))
	51	+ with open(filepath, "r") as f:
	52	+ for i, line in enumerate(f):
	53	+ if toy_version == True and i > 100:
	54	+ break
	55	+ metas = np.append(
	56	+ metas,
	57	+ np.asarray([line.rstrip('\n').split(",")]),
	58	+ axis=0)
	59	+ return metas
	60	+
	61	+'''
	62	+Save a vector file from metas and vector values
	63	+'''
	64	+def save_file(filepath, metas, values=None):
	65	+ with open(filepath, "w") as f:
	66	+ for i in range(len(metas)):
	67	+ metas_str = ",".join(str(v) for v in metas[i])
	68	+ if not values == None:
	69	+ try:
	70	+ infos_str = " ".join(str(v) for v in values[i])
	71	+ except TypeError as te:
	72	+ infos_str = str(values[i])
	73	+ f.write(metas_str + " " + infos_str + "\n")
	74	+ else:
	75	+ f.write(metas_str + "\n")
	76	+
	77	+'''
	78	+Take the data and index them.
	79	+'''
	80	+def index_data(metas, vectors):
	81	+ data = {}
	82	+ data["en-us"] = {}
	83	+ data["fr-fr"] = {}
	84	+ for i, vector in enumerate(vectors):
	85	+ meta = metas[i]
	86	+ data[meta[0]][meta[3]] = {}
	87	+ data[meta[0]][meta[3]]["metas"] = meta
	88	+ data[meta[0]][meta[3]]["vector"] = vector
	89	+ return data
	90	+
	91	+
	92	+
	93	+'''
	94	+Récupère un sous ensemble des données de base à partir d'une
	95	+liste.
	96	+'''
	97	+def get_subdata(data, lst):
	98	+ metas = ""
	99	+ vectors = ""
	100	+ for meta in lst:
	101	+ vector = data[meta[0]][meta[3]]["vector"]
	102	+ if(len(metas) == 0):
	103	+ metas = np.empty((0, len(meta)))
	104	+ vectors = np.empty((0, len(vector)), np.float64)
	105	+ metas = np.append(
	106	+ metas,
	107	+ np.asarray([data[meta[0]][meta[3]]["metas"]]),
	108	+ axis=0)
	109	+ vectors = np.append(
	110	+ vectors,
	111	+ np.asarray([vector]),
	112	+ axis=0)
	113	+ return metas, vectors
	114	+
	115	+
	116	+'''
	117	+Apply clustering on data of filename.
	118	+Use a list to determine train et test using train valid, test.
	119	+Save the file with the given suffix.
	120	+Check the existence of the files before calculating and saving,
	121	+if the two files already exist, it will not calculate it again.
	122	+
	123	+However: if one file is not present, this function will calculate
	124	+it again.
	125	+
	126	+TODO: Add a variable to force the calculation of all the files
	127	+even if they exist.
	128	+'''
	129	+def apply_clustering(filename, dir_lst, dir_data, suffix_outfile):
	130	+
	131	+ # Applicate it for normal version
	132	+ metas, vectors = read_vecfile(os.path.join(dir_data, filename), toy_version=False)
	133	+ data = index_data(metas, vectors)
	134	+
	135	+ ### CURSOR
	136	+ # Get Train
	137	+ train_lst = read_lstfile(os.path.join(dir_lst, "train_" + str(NUMBER) + ".lst"))
	138	+ train_metas, train_vectors = get_subdata(data, train_lst)
	139	+
	140	+ # Get Val
	141	+ val_lst = read_lstfile(os.path.join(dir_lst, "val_" + str(NUMBER) + ".lst"))
	142	+ val_metas, val_vectors = get_subdata(data, val_lst)
	143	+
	144	+ # Get Test
	145	+ test_lst = read_lstfile(os.path.join(dir_lst, "test_" + str(NUMBER) + ".lst"))
	146	+ test_metas, test_vectors = get_subdata(data, test_lst)
	147	+
	148	+ # Verif shapes
	149	+ print("verif shapes")
	150	+ print(train_metas.shape)
	151	+ print(val_metas.shape)
	152	+ print(test_metas.shape)
	153	+
	154	+ # Entrainer le k-means sur le train + val
	155	+ #Ks = [12, 24, 48]
	156	+
	157	+ print("k=[", end="")
	158	+ Ks = [6,12,24,48,64]
	159	+ for k in Ks:
	160	+ # Process the name
	161	+ suffix = "_" + suffix_outfile if not suffix_outfile == "" else ""
	162	+ k_str = "{:03d}".format(k) # K in string
	163	+ filename_pickle = os.path.join(
	164	+ DIR_DATA,
	165	+ "clusters_trained_on_train_" +str(k_str)+ "_pickle_" + suffix + ".txt")
	166	+ filename_clusters = os.path.join(
	167	+ DIR_DATA,
	168	+ "clusters_trained_on_train_" +str(k_str)+ suffix + ".txt")
	169	+
	170	+ # Check if on of the two file does not exist
	171	+ condition = not(
	172	+ os.path.exists(filename_pickle)
	173	+ and os.path.exists(filename_clusters)
	174	+ )
	175	+
	176	+ if condition:
	177	+ print(str(k)+",", end=" ")
	178	+ kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(
	179	+ train_vectors)
	180	+ test_pred = kmeans.predict(np.concatenate((val_vectors, test_vectors), axis=0))
	181	+ metas_tosave = np.concatenate([train_metas, val_metas, test_metas], axis=0)
	182	+ values_tosave = np.concatenate([kmeans.labels_, test_pred], axis=0)
	183	+ metas_tosave[:, 1] = values_tosave # Replace char by clusters
	184	+ save_file(filename_clusters, metas_tosave)
	185	+ pickle.dump(kmeans, open( filename_pickle, "wb" ) )
	186	+ print("]")
	187	+
	188	+for NUMBER in range(1, 5):
	189	+ print("JACKKNIFING NUMBER: " + str(NUMBER))
	190	+ DIR_MAIN="exp/pvector-1"
	191	+ DIR_DATA=os.path.join(DIR_MAIN, str(NUMBER))
	192	+ DIR_LST=os.path.join(DIR_MAIN, "lst")
	193	+ OUTFILE_NAME="clustering"
	194	+
	195	+ print("Calculating mass_effect_pvectors")
	196	+ apply_clustering("masseffect_pvectors.txt",
	197	+ dir_lst = os.path.join(DIR_MAIN, "lst"),
	198	+ dir_data = DIR_DATA,
	199	+ suffix_outfile = "")
	200	+
	201	+ print("Calculating mass_effect_pvectors_final")
	202	+ apply_clustering("masseffect_pvectors_final.txt",
	203	+ dir_lst = os.path.join(DIR_MAIN, "lst"),
	204	+ dir_data = DIR_DATA,
	205	+ suffix_outfile = "final")

bin/data.py

Diff comments View file @ ac78b07

	1	+'''
	2	+This module aims at loading and writing files.
	3	+Our files respect a specific format that
	4	+is not standard. This is why I hope these
	5	+functions make reading the files easier.
	6	+
	7	+For more information about the data, read
	8	+the README file please.
	9	+'''
	10	+
	11	+import sys
	12	+
	13	+def read_file(filepath):
	14	+ '''
	15	+ Read the file and return an array with pairs
	16	+ where each pair is composed by the metas and the
	17	+ features.
	18	+ '''
	19	+ data = []
	20	+ with open(filepath, "r") as f:
	21	+ for line in f:
	22	+ splited = line.replace("\n", "").split(" ")
	23	+ metas = splited[0].split(",")
	24	+ features = splited[1:]
	25	+ data.append((metas, features))
	26	+ return data
	27	+
	28	+
	29	+def index_by(data, num_col):
	30	+ '''
	31	+ Allows the user to index data by number of columns.
	32	+ '''
	33	+ indexed = {}
	34	+ for line in data:
	35	+ metas = line[0]
	36	+ features = line[1]
	37	+ if metas[num_col] not in indexed:
	38	+ indexed[metas[num_col]] = []
	39	+ indexed[metas[num_col]].append((metas, features))
	40	+ return indexed
	41	+
	42	+
	43	+def index_by_id(data):
	44	+ '''
	45	+ Allows the user to index data by id.
	46	+ Index data by id consists in indexing two times
	47	+ because data have two keys. On with the language
	48	+ and the other one with the id of the sentence.
	49	+ '''
	50	+ indexed = {}
	51	+ for line in data:
	52	+ metas = line[0]
	53	+ id_sen = metas[3]
	54	+ lang = metas[0]
	55	+ if lang not in indexed:
	56	+ indexed[lang] = {}
	57	+ indexed[lang][id_sen] = line
	58	+ return indexed
	59	+
	60	+
	61	+def write_line(metas, features, f=sys.stdout):
	62	+ '''
	63	+ Just print the line. No need to specify a file.
	64	+
	65	+ metas: meta information on list
	66	+ features: feature vector
	67	+ f: file to write it
	68	+ '''
	69	+ print(",".join(metas) + " " + " ".join(features), file=f)

bin/extract_kmeans.py

Diff comments View file @ ac78b07

	1	+'''
	2	+This script aims to extract k-means clustering from a
	3	+a priori trained k-means.
	4	+'''
	5	+
	6	+import argparse
	7	+import numpy as np
	8	+import pickle
	9	+from data import read_file, index_by_id, write_line
	10	+import sys
	11	+
	12	+# -- ARGPARSE --
	13	+parser = argparse.ArgumentParser(description="extract clusters")
	14	+parser.add_argument("model", type=str, help="k-means model pickle")
	15	+parser.add_argument("features", type=str, help="features")
	16	+parser.add_argument("list", type=str, help="list file")
	17	+parser.add_argument("--outfile", type=str, default=None, help="output file std")
	18	+
	19	+args = vars(parser.parse_args())
	20	+MODEL = args["model"]
	21	+FEATURES = args["features"]
	22	+LST = args["list"]
	23	+OUTFILE = args["outfile"]
	24	+
	25	+if OUTFILE == None:
	26	+ OUTFILE = sys.stdout
	27	+else:
	28	+ OUTFILE = open(OUTFILE, "w")
	29	+
	30	+# -- READ FILE --
	31	+features = read_file(FEATURES)
	32	+feat_ind = index_by_id(features)
	33	+
	34	+lst = read_file(LST)
	35	+
	36	+kmeans = pickle.load(open(MODEL, "rb"))
	37	+
	38	+
	39	+# -- CONVERT TO NUMPY --
	40	+X = np.asarray([feat_ind[x[0][0]][x[0][3]][1] for x in lst])
	41	+predictions = kmeans.predict(X)
	42	+
	43	+for i, line in enumerate(lst):
	44	+ meta = line[0]
	45	+ meta[1] = str(predictions[i])
	46	+ write_line(
	47	+ meta,
	48	+ feat_ind[meta[0]][meta[3]][1],
	49	+ OUTFILE
	50	+ )
	51	+
	52	+# -- CLOSE OUT FILE IF NECESSARY --
	53	+if not OUTFILE == sys.stdout:
	54	+ OUTFILE.close()

bin/extract_vectors.py

Diff comments View file @ ac78b07

	1	+'''
	2	+The goal of this script is to extract vectors from a list.
	3	+One file is the full content, and the list only enumerate the
	4	+vectors you want to keep.
	5	+'''
	6	+
	7	+import os
	8	+import numpy as np
	9	+import argparse
	10	+
	11	+parser = argparse.ArgumentParser(description='Extract a subset of vectors')
	12	+parser.add_argument('vectorsfile', type=str,
	13	+ help='the path of the file containing the convectors')
	14	+parser.add_argument('listfile', type=str,
	15	+ help='the path of the file containing the list of vectors kept')
	16	+parser.add_argument('-o', '--output', type=str,
	17	+ default='a.out',
	18	+ help='the path the output file containing the vectors kept')
	19	+
	20	+args = parser.parse_args()
	21	+
	22	+# Editing global variable
	23	+VECTOR_FILE = args.vectorsfile
	24	+LIST_FILE = args.listfile
	25	+OUTPUT_FILE = args.output
	26	+
	27	+# READ VECTOR DATA
	28	+data = {}
	29	+data["en-us"] = {}
	30	+data["fr-fr"] = {}
	31	+with open(VECTOR_FILE, "r") as f:
	32	+ for i, line in enumerate(f):
	33	+ if TOY_VERSION == True and i > 100:
	34	+ break
	35	+ spl_line = line.split(" ")
	36	+ if(len(pvectors) == 0):
	37	+ pvectors = np.empty((0, len(spl_line[1:])), np.float32)
	38	+ spl_meta = spl_line.split(",")
	39	+ lang = spl_meta[0]
	40	+ iden = spl_meta[3]
	41	+ data[lang][iden] = line
	42	+
	43	+# READ LIST AND WRITE NEW FILE
	44	+with open(LIST_FILE, "r") as f, open(OUTPUT_FILE, "w") as o:
	45	+ for i, line in enumerate(LIST_FILE):
	46	+ if TOY_VERSION == True and i > 100:
	47	+ break
	48	+ spl_meta = line.split(",")
	49	+ lang = spl_meta[0]
	50	+ iden = spl_meta[3]
	51	+ OUTPUT_FILE.write(data[lang][iden])

bin/plot.py

Diff comments View file @ ac78b07

	1	+'''
	2	+Take a file and plot its data onto a 2d or 3d axis depending on the data.
	3	+'''
	4	+
	5	+import os
	6	+import numpy as np
	7	+from sklearn.cluster import KMeans
	8	+import matplotlib.pyplot as plt
	9	+import argparse
	10	+import json
	11	+
	12	+# Defining argparse
	13	+parser = argparse.ArgumentParser(prog='Plotter', description='Plot a file of 2d ou 3d dimension')
	14	+parser.add_argument('filepath', type=str,
	15	+ help='the path of the file you want to plot')
	16	+parser.add_argument('-o-', '--output', type=str,
	17	+ default='plot.pdf',
	18	+ help='the path of the ploted file')
	19	+parser.add_argument('-t', '--toy', action='store_true',
	20	+ help='test the script on a toy example. Do not test all the file content')
	21	+
	22	+args = parser.parse_args()
	23	+
	24	+# Editing global variable
	25	+FILE_PATH=args.filepath
	26	+OUTFILE_PATH = args.output
	27	+TOY_VERSION = args.toy
	28	+
	29	+# Defining vectors with default number of column
	30	+vectors = np.empty((0, 64), np.float32)
	31	+metas = np.empty((0, 4), np.float32)
	32	+
	33	+# READ DATA
	34	+with open(os.path.join(FILE_PATH), "r") as f:
	35	+ for i, line in enumerate(f):
	36	+ if TOY_VERSION == True and i > 100:
	37	+ break
	38	+ spl_line = line.split(" ")
	39	+ if(len(vectors) == 0):
	40	+ vectors = np.empty((0, len(spl_line[1:])), np.float32)
	41	+ metas = np.append(
	42	+ metas,
	43	+ np.asarray([spl_line[0].split(",")]),
	44	+ axis=0)
	45	+
	46	+ vectors = np.append(
	47	+ vectors,
	48	+ np.asarray([spl_line[1:]], dtype=np.float32),
	49	+ axis=0)
	50	+
	51	+vectors_T = np.transpose(vectors)
	52	+
	53	+
	54	+# Plot the file
	55	+plt.plot(vectors, 'ro')
	56	+fig, ax = plt.subplots()
	57	+
	58	+if(vectors_T.shape[0] == 2):
	59	+ ax.scatter(vectors_T[0], vectors_T[1]) #c=close, s=volume, alpha=0.5)
	60	+else:
	61	+ ax.scatter(vectors_T[0], vectors_T[1], vectors_T[2])
	62	+
	63	+ax.set_xlabel('Axe 1', fontsize=15)
	64	+ax.set_ylabel('Axe 2', fontsize=15)
	65	+
	66	+if(vectors_T.shape[0] == 3):
	67	+ ax.set_zlabel('Axe 3', fontsize15=15)
	68	+
	69	+ax.set_title('Volume and percent change')
	70	+plt.savefig(OUTFILE_PATH)

bin/plot_character.py

Diff comments View file @ ac78b07

	1	+'''
	2	+Take a file and plot its data onto a 2d or 3d axis depending on the data.
	3	+Automatic detection of the number of dimension.
	4	+'''
	5	+
	6	+import os
	7	+import numpy as np
	8	+from sklearn.cluster import KMeans
	9	+import matplotlib.pyplot as plt
	10	+import argparse
	11	+import json
	12	+import pandas as pd
	13	+
	14	+# Defining useful functions
	15	+
	16	+'''
	17	+Read the file whose content is metas and vectors.
	18	+Returns two numpy array : (metas, vectors)
	19	+
	20	+'''
	21	+def read_vector_file(filename, toy_version=False):
	22	+ vectors = np.empty((0, 1), np.float32)
	23	+ metas = np.empty((0, 4), np.float32)
	24	+ with open(filename, "r") as f:
	25	+ for i, line in enumerate(f):
	26	+ if toy_version == True and i > 100:
	27	+ break
	28	+ spl_line = line.split(" ")
	29	+ if(len(vectors) == 0):
	30	+ vectors = np.empty((0, len(spl_line[1:])), np.float32)
	31	+ metas = np.append(
	32	+ metas,
	33	+ np.asarray([spl_line[0].split(",")]),
	34	+ axis=0)
	35	+
	36	+ vectors = np.append(
	37	+ vectors,
	38	+ np.asarray([spl_line[1:]], dtype=np.float32),
	39	+ axis=0)
	40	+ return (metas, vectors)
	41	+
	42	+
	43	+# Defining argparse
	44	+parser = argparse.ArgumentParser(description='Plot a file of 2d ou 3d dimension')
	45	+parser.add_argument('vectorfile', type=str,
	46	+ help='the path of the vectors file')
	47	+parser.add_argument('-o-', '--output', type=str,
	48	+ default='plot.pdf',
	49	+ help='the path of the ploted file')
	50	+parser.add_argument('-t', '--toy', action='store_true',
	51	+ help='test the script on a toy example. Do not test all the file content')
	52	+
	53	+args = parser.parse_args()
	54	+
	55	+# Editing global variable
	56	+VECTORFILE_PATH=args.vectorfile
	57	+OUTFILE_PATH = args.output
	58	+TOY_VERSION = args.toy
	59	+
	60	+
	61	+# Get Vectors
	62	+metas, vectors = read_vector_file(VECTORFILE_PATH, toy_version = TOY_VERSION)
	63	+vectors_T = np.transpose(vectors)
	64	+
	65	+print("Number of characters: " + str(len(np.unique(np.transpose(metas)[1]))))
	66	+df = pd.DataFrame(dict(
	67	+ x=vectors_T[0],
	68	+ y=vectors_T[1],
	69	+ character=np.transpose(metas)[1]
	70	+ ))
	71	+
	72	+groups = df.groupby('character')
	73	+
	74	+# Plot
	75	+fig, ax = plt.subplots()
	76	+
	77	+for character, group in groups:
	78	+ ax.plot(group.x, group.y, marker='o', linestyle='', ms=2, label=character)
	79	+plt.savefig(OUTFILE_PATH)
	80	+print("Your plot is saved well (no check of this affirmation)")

bin/plot_clusters.py

Diff comments View file @ ac78b07

	1	+'''
	2	+Take a file and plot its data onto a 2d or 3d axis depending on the data.
	3	+'''
	4	+
	5	+import os
	6	+import numpy as np
	7	+from sklearn.cluster import KMeans
	8	+import matplotlib.pyplot as plt
	9	+import argparse
	10	+import json
	11	+import pandas as pd
	12	+
	13	+# Defining useful functions
	14	+
	15	+'''
	16	+Read the file whose content is metas and vectors.
	17	+Returns two numpy array : (metas, vectors)
	18	+
	19	+'''
	20	+def read_vector_file(filename, toy_version=False):
	21	+ vectors = np.empty((0, 1), np.float32)
	22	+ metas = np.empty((0, 4), np.float32)
	23	+ with open(filename, "r") as f:
	24	+ for i, line in enumerate(f):
	25	+ if toy_version == True and i > 100:
	26	+ break
	27	+ spl_line = line.split(" ")
	28	+ if(len(vectors) == 0):
	29	+ vectors = np.empty((0, len(spl_line[1:])), np.float32)
	30	+ metas = np.append(
	31	+ metas,
	32	+ np.asarray([spl_line[0].split(",")]),
	33	+ axis=0)
	34	+
	35	+ vectors = np.append(
	36	+ vectors,
	37	+ np.asarray([spl_line[1:]], dtype=np.float32),
	38	+ axis=0)
	39	+ return (metas, vectors)
	40	+
	41	+
	42	+'''
	43	+Check if the two given files have the same order.
	44	+'''
	45	+def check_files(vector_file, cluster_file):
	46	+ with open(vector_file, "r") as f1, open(cluster_file, "r") as f2:
	47	+ for line1, line2 in zip(f1, f2):
	48	+ line1_str = line1.strip()
	49	+ line2_str = line2.strip()
	50	+ metas1 = line1_str.split(" ")[0].split(",")
	51	+ metas2 = line2_str.split(" ")[0].split(",")
	52	+ if(not metas1[0] == metas2[0] or not metas1[3] == metas2[3]):
	53	+ return False
	54	+ return True
	55	+
	56	+
	57	+
	58	+
	59	+
	60	+# Defining argparse
	61	+parser = argparse.ArgumentParser(prog='Plotter', description='Plot a file of 2d ou 3d dimension')
	62	+parser.add_argument('clusterfile', type=str,
	63	+ help='the path of the cluster file')
	64	+parser.add_argument('vectorfile', type=str,
	65	+ help='the path of the vectors file')
	66	+parser.add_argument('-o-', '--output', type=str,
	67	+ default='plot.pdf',
	68	+ help='the path of the ploted file')
	69	+parser.add_argument('-t', '--toy', action='store_true',
	70	+ help='test the script on a toy example. Do not test all the file content')
	71	+
	72	+args = parser.parse_args()
	73	+
	74	+# Editing global variable
	75	+CLUSTERFILE_PATH=args.clusterfile
	76	+VECTORFILE_PATH=args.vectorfile
	77	+OUTFILE_PATH = args.output
	78	+TOY_VERSION = args.toy
	79	+
	80	+if check_files(VECTORFILE_PATH, CLUSTERFILE_PATH) == False:
	81	+ print("Les fichiers ne sont pas dans le meme ordre. Dans une version futur, cela générera une exception. On stop le processus.")
	82	+ exit(1)
	83	+
	84	+# Get Vectors
	85	+metas, vectors = read_vector_file(VECTORFILE_PATH, toy_version = TOY_VERSION)
	86	+vectors_T = np.transpose(vectors)
	87	+
	88	+# Get Clusters
	89	+metas, clusters = read_vector_file(CLUSTERFILE_PATH, toy_version = TOY_VERSION)
	90	+
	91	+#print(np.transpose(clusters)[0])
	92	+#print(np.transpose(metas)[0])
	93	+df = pd.DataFrame(dict(
	94	+ x=vectors_T[0],
	95	+ y=vectors_T[1],
	96	+ cluster=np.transpose(clusters)[0]
	97	+ ))
	98	+
	99	+groups = df.groupby('cluster')
	100	+
	101	+# Plot
	102	+fig, ax = plt.subplots()
	103	+
	104	+for cluster, group in groups:
	105	+ ax.plot(group.x, group.y, marker='o', linestyle='', ms=2, label=cluster)
	106	+ax.legend()
	107	+plt.savefig(OUTFILE_PATH)

bin/tsne_clustering_plot.py

Diff comments View file @ ac78b07

	1	+'''
	2	+Take one file with the clustering,
	3	+take another file with the t-SNE projection,
	4	+and then plot them together.
	5	+'''

bin/tsne_pvector.py

Diff comments View file @ ac78b07

	1	+'''
	2	+The goal of this script is to compute the t-SNE projection of pvectors.
	3	+'''
	4	+
	5	+import os
	6	+import argparse
	7	+import numpy as np
	8	+from sklearn.manifold import TSNE
	9	+
	10	+# Defining argparse
	11	+parser = argparse.ArgumentParser(prog='pvector tsne', description='Calculate the tsne representation of pvector in 3 or 2d')
	12	+parser.add_argument('filepath', type=str,
	13	+ help='the path of the file you want to calculate tsne')
	14	+parser.add_argument('-o', '--output', type=str,
	15	+ default='.',
	16	+ help='the path of the output file.')
	17	+parser.add_argument('-n', '--n-comp', type=int, choices=[2, 3],
	18	+ default='2',
	19	+ help='number of components output of tsne')
	20	+parser.add_argument('-t', '--toy', action='store_true',
	21	+ help='test the script on a toy example. Do not test all the file content.')
	22	+args = parser.parse_args()
	23	+
	24	+# Editing global variable
	25	+FILE_PATH=args.filepath
	26	+OUTFILE_PATH=args.output
	27	+TOY_VERSION=args.toy
	28	+N_COMP=args.n_comp
	29	+
	30	+# Defining pvectors with default number of column
	31	+pvectors = np.empty((0, 64), np.float32)
	32	+metas = np.empty((0, 4), np.float32)
	33	+
	34	+
	35	+# READ DATA
	36	+with open(os.path.join(FILE_PATH), "r") as f:
	37	+ for i, line in enumerate(f):
	38	+ if TOY_VERSION == True and i > 100:
	39	+ break
	40	+ spl_line = line.split(" ")
	41	+ if(len(pvectors) == 0):
	42	+ pvectors = np.empty((0, len(spl_line[1:])), np.float32)
	43	+ metas = np.append(
	44	+ metas,
	45	+ np.asarray([spl_line[0].split(",")]),
	46	+ axis=0)
	47	+ pvectors = np.append(
	48	+ pvectors,
	49	+ np.asarray([spl_line[1:]], dtype=np.float32),
	50	+ axis=0)
	51	+
	52	+
	53	+
	54	+# PREPARE SAVE FILE FUNCTION
	55	+def save_file(filepath, metas, values):
	56	+ with open(filepath, "w") as f:
	57	+ for i, value in enumerate(values):
	58	+ metas_str = ",".join(str(v) for v in metas[i])
	59	+ try:
	60	+ infos_str = " ".join(str(v) for v in values[i])
	61	+ except TypeError as te:
	62	+ infos_str = str(values[i])
	63	+ f.write(metas_str + " " + infos_str + "\n")
	64	+
	65	+# CALCULATE T-SNE
	66	+X_embedded = TSNE(n_components=N_COMP).fit_transform(pvectors)
	67	+save_file(OUTFILE_PATH, metas, X_embedded)