Commit ca0fcf2c3f1b1faa7cdef49e229d135c891176c6

Authored by Mathias Quillot
1 parent 0b30718782
Exists in master

Adapting the script to my library. It still needs more work before it is usable again.

Showing 1 changed file with 17 additions and 12 deletions Inline Diff

bin/plot_clusters.py
1 ''' 1 '''
2 Take a file and plot its data onto a 2d or 3d axis depending on the data. 2 Take a file and plot its data onto a 2d or 3d axis depending on the data.
3 ''' 3 '''
4 4
5 import os 5 import os
6 import numpy as np 6 import numpy as np
7 from sklearn.cluster import KMeans 7 from sklearn.cluster import KMeans
8 import matplotlib.pyplot as plt 8 import matplotlib.pyplot as plt
9 import argparse 9 import argparse
10 import json 10 import json
11 import pandas as pd 11 import pandas as pd
12 12
13 # Defining useful functions 13 # Defining useful functions
14 14
15 ''' 15 '''
16 Read the file whose content is metas and vectors. 16 Read the file whose content is metas and vectors.
17 Returns two numpy array : (metas, vectors) 17 Returns two numpy array : (metas, vectors)
18 18
19 ''' 19 '''
20 def read_vector_file(filename, toy_version=False): 20 def read_vector_file(filename, toy_version=False):
21 vectors = np.empty((0, 1), np.float32) 21 vectors = np.empty((0, 1), np.float32)
22 metas = np.empty((0, 4), np.float32) 22 metas = np.empty((0, 4), np.float32)
23 with open(filename, "r") as f: 23 with open(filename, "r") as f:
24 for i, line in enumerate(f): 24 for i, line in enumerate(f):
25 if toy_version == True and i > 100: 25 if toy_version == True and i > 100:
26 break 26 break
27 spl_line = line.split(" ") 27 spl_line = line.split(" ")
28 if(len(vectors) == 0): 28 if(len(vectors) == 0):
29 vectors = np.empty((0, len(spl_line[1:])), np.float32) 29 vectors = np.empty((0, len(spl_line[1:])), np.float32)
30 metas = np.append( 30 metas = np.append(
31 metas, 31 metas,
32 np.asarray([spl_line[0].split(",")]), 32 np.asarray([spl_line[0].split(",")]),
33 axis=0) 33 axis=0)
34 34
35 vectors = np.append( 35 vectors = np.append(
36 vectors, 36 vectors,
37 np.asarray([spl_line[1:]], dtype=np.float32), 37 np.asarray([spl_line[1:]], dtype=np.float32),
38 axis=0) 38 axis=0)
39 return (metas, vectors) 39 return (metas, vectors)
40 40
41 41
42 ''' 42 '''
43 Check if the two given files have the same order. 43 Check if the two given files have the same order.
44 ''' 44 '''
45 def check_files(vector_file, cluster_file): 45 def check_files(vector_file, cluster_file):
46 with open(vector_file, "r") as f1, open(cluster_file, "r") as f2: 46 with open(vector_file, "r") as f1, open(cluster_file, "r") as f2:
47 for line1, line2 in zip(f1, f2): 47 for line1, line2 in zip(f1, f2):
48 line1_str = line1.strip() 48 line1_str = line1.strip()
49 line2_str = line2.strip() 49 line2_str = line2.strip()
50 metas1 = line1_str.split(" ")[0].split(",") 50 metas1 = line1_str.split(" ")[0].split(",")
51 metas2 = line2_str.split(" ")[0].split(",") 51 metas2 = line2_str.split(" ")[0].split(",")
52 if(not metas1[0] == metas2[0] or not metas1[3] == metas2[3]): 52 if(not metas1[0] == metas2[0] or not metas1[3] == metas2[3]):
53 return False 53 return False
54 return True 54 return True
55 55
56 56
57 57
58 from data import read_file, index_by_id
58 59
59
60 # Defining argparse 60 # Defining argparse
61 parser = argparse.ArgumentParser(prog='Plotter', description='Plot a file of 2d ou 3d dimension') 61 parser = argparse.ArgumentParser(prog='Plotter', description='Plot a file of 2d ou 3d dimension')
62 parser.add_argument('clusterfile', type=str, 62 parser.add_argument('clusterfile', type=str,
63 help='the path of the cluster file') 63 help='the path of the cluster file')
64 parser.add_argument('vectorfile', type=str, 64 parser.add_argument('vectorfile', type=str,
65 help='the path of the vectors file') 65 help='the path of the vectors file')
66 parser.add_argument('-o-', '--output', type=str, 66 parser.add_argument('-o-', '--output', type=str,
67 default='plot.pdf', 67 default='plot.pdf',
68 help='the path of the ploted file') 68 help='the path of the ploted file')
69 parser.add_argument('-t', '--toy', action='store_true',
70 help='test the script on a toy example. Do not test all the file content')
71 69
72 args = parser.parse_args() 70 args = parser.parse_args()
73 71
74 # Editing global variable 72 # Editing global variable
75 CLUSTERFILE_PATH=args.clusterfile 73 CLUSTERFILE_PATH=args.clusterfile
76 VECTORFILE_PATH=args.vectorfile 74 VECTORFILE_PATH=args.vectorfile
77 OUTFILE_PATH = args.output 75 OUTFILE_PATH = args.output
78 TOY_VERSION = args.toy
79 76
80 if check_files(VECTORFILE_PATH, CLUSTERFILE_PATH) == False: 77 data_vector = read_file(VECTORFILE_PATH)
81 print("Les fichiers ne sont pas dans le meme ordre. Dans une version futur, cela générera une exception. On stop le processus.") 78 features = np.asarray([x[1] for x in data_vector])
82 exit(1) 79 features_T = np.transpose(features)
83 80
81 data_cluster = read_file(CLUSTERFILE_PATH)
82 data_cluster_ind = index_by_id(data_cluster)
83 clusters = [data_cluster_ind[x[0][0]][x[0][3]][0][1] for x in data_vector]
84
85
86 # TODO: compute tsne file
87 # TODO: adapt the script for the new library
88 df = pd.DataFrame(dict(
89 x=features_T[0],
90 y=features_T[1],
91 cluster=np.transpose(clusters)[0]
92 ))
93 exit(1)
84 # Get Vectors 94 # Get Vectors
85 metas, vectors = read_vector_file(VECTORFILE_PATH, toy_version = TOY_VERSION) 95 metas, vectors = read_vector_file(VECTORFILE_PATH, toy_version = TOY_VERSION)
86 vectors_T = np.transpose(vectors) 96 vectors_T = np.transpose(vectors)
87 97
88 # Get Clusters 98 # Get Clusters
89 metas, clusters = read_vector_file(CLUSTERFILE_PATH, toy_version = TOY_VERSION) 99 metas, clusters = read_vector_file(CLUSTERFILE_PATH, toy_version = TOY_VERSION)
90 100
91 #print(np.transpose(clusters)[0]) 101 #print(np.transpose(clusters)[0])
92 #print(np.transpose(metas)[0]) 102 #print(np.transpose(metas)[0])
93 df = pd.DataFrame(dict(
94 x=vectors_T[0],
95 y=vectors_T[1],
96 cluster=np.transpose(clusters)[0]
97 ))
98 103
99 groups = df.groupby('cluster') 104 groups = df.groupby('cluster')