Commit ca0fcf2c3f1b1faa7cdef49e229d135c891176c6
1 parent
0b30718782
Exists in
master
Adapting the script to my library. Still needs more work.
Showing 1 changed file with 17 additions and 12 deletions Inline Diff
bin/plot_clusters.py
1 | ''' | 1 | ''' |
2 | Take a file and plot its data onto a 2d or 3d axis depending on the data. | 2 | Take a file and plot its data onto a 2d or 3d axis depending on the data. |
3 | ''' | 3 | ''' |
4 | 4 | ||
5 | import os | 5 | import os |
6 | import numpy as np | 6 | import numpy as np |
7 | from sklearn.cluster import KMeans | 7 | from sklearn.cluster import KMeans |
8 | import matplotlib.pyplot as plt | 8 | import matplotlib.pyplot as plt |
9 | import argparse | 9 | import argparse |
10 | import json | 10 | import json |
11 | import pandas as pd | 11 | import pandas as pd |
12 | 12 | ||
13 | # Defining useful functions | 13 | # Defining useful functions |
14 | 14 | ||
15 | ''' | 15 | ''' |
16 | Read the file whose content is metas and vectors. | 16 | Read the file whose content is metas and vectors. |
17 | Returns two numpy array : (metas, vectors) | 17 | Returns two numpy array : (metas, vectors) |
18 | 18 | ||
19 | ''' | 19 | ''' |
20 | def read_vector_file(filename, toy_version=False): | 20 | def read_vector_file(filename, toy_version=False): |
21 | vectors = np.empty((0, 1), np.float32) | 21 | vectors = np.empty((0, 1), np.float32) |
22 | metas = np.empty((0, 4), np.float32) | 22 | metas = np.empty((0, 4), np.float32) |
23 | with open(filename, "r") as f: | 23 | with open(filename, "r") as f: |
24 | for i, line in enumerate(f): | 24 | for i, line in enumerate(f): |
25 | if toy_version == True and i > 100: | 25 | if toy_version == True and i > 100: |
26 | break | 26 | break |
27 | spl_line = line.split(" ") | 27 | spl_line = line.split(" ") |
28 | if(len(vectors) == 0): | 28 | if(len(vectors) == 0): |
29 | vectors = np.empty((0, len(spl_line[1:])), np.float32) | 29 | vectors = np.empty((0, len(spl_line[1:])), np.float32) |
30 | metas = np.append( | 30 | metas = np.append( |
31 | metas, | 31 | metas, |
32 | np.asarray([spl_line[0].split(",")]), | 32 | np.asarray([spl_line[0].split(",")]), |
33 | axis=0) | 33 | axis=0) |
34 | 34 | ||
35 | vectors = np.append( | 35 | vectors = np.append( |
36 | vectors, | 36 | vectors, |
37 | np.asarray([spl_line[1:]], dtype=np.float32), | 37 | np.asarray([spl_line[1:]], dtype=np.float32), |
38 | axis=0) | 38 | axis=0) |
39 | return (metas, vectors) | 39 | return (metas, vectors) |
40 | 40 | ||
41 | 41 | ||
42 | ''' | 42 | ''' |
43 | Check if the two given files have the same order. | 43 | Check if the two given files have the same order. |
44 | ''' | 44 | ''' |
45 | def check_files(vector_file, cluster_file): | 45 | def check_files(vector_file, cluster_file): |
46 | with open(vector_file, "r") as f1, open(cluster_file, "r") as f2: | 46 | with open(vector_file, "r") as f1, open(cluster_file, "r") as f2: |
47 | for line1, line2 in zip(f1, f2): | 47 | for line1, line2 in zip(f1, f2): |
48 | line1_str = line1.strip() | 48 | line1_str = line1.strip() |
49 | line2_str = line2.strip() | 49 | line2_str = line2.strip() |
50 | metas1 = line1_str.split(" ")[0].split(",") | 50 | metas1 = line1_str.split(" ")[0].split(",") |
51 | metas2 = line2_str.split(" ")[0].split(",") | 51 | metas2 = line2_str.split(" ")[0].split(",") |
52 | if(not metas1[0] == metas2[0] or not metas1[3] == metas2[3]): | 52 | if(not metas1[0] == metas2[0] or not metas1[3] == metas2[3]): |
53 | return False | 53 | return False |
54 | return True | 54 | return True |
55 | 55 | ||
56 | 56 | ||
57 | 57 | ||
58 | from data import read_file, index_by_id | ||
58 | 59 | ||
59 | |||
60 | # Defining argparse | 60 | # Defining argparse |
61 | parser = argparse.ArgumentParser(prog='Plotter', description='Plot a file of 2d ou 3d dimension') | 61 | parser = argparse.ArgumentParser(prog='Plotter', description='Plot a file of 2d ou 3d dimension') |
62 | parser.add_argument('clusterfile', type=str, | 62 | parser.add_argument('clusterfile', type=str, |
63 | help='the path of the cluster file') | 63 | help='the path of the cluster file') |
64 | parser.add_argument('vectorfile', type=str, | 64 | parser.add_argument('vectorfile', type=str, |
65 | help='the path of the vectors file') | 65 | help='the path of the vectors file') |
66 | parser.add_argument('-o-', '--output', type=str, | 66 | parser.add_argument('-o-', '--output', type=str, |
67 | default='plot.pdf', | 67 | default='plot.pdf', |
68 | help='the path of the ploted file') | 68 | help='the path of the ploted file') |
69 | parser.add_argument('-t', '--toy', action='store_true', | ||
70 | help='test the script on a toy example. Do not test all the file content') | ||
71 | 69 | ||
72 | args = parser.parse_args() | 70 | args = parser.parse_args() |
73 | 71 | ||
74 | # Editing global variable | 72 | # Editing global variable |
75 | CLUSTERFILE_PATH=args.clusterfile | 73 | CLUSTERFILE_PATH=args.clusterfile |
76 | VECTORFILE_PATH=args.vectorfile | 74 | VECTORFILE_PATH=args.vectorfile |
77 | OUTFILE_PATH = args.output | 75 | OUTFILE_PATH = args.output |
78 | TOY_VERSION = args.toy | ||
79 | 76 | ||
80 | if check_files(VECTORFILE_PATH, CLUSTERFILE_PATH) == False: | 77 | data_vector = read_file(VECTORFILE_PATH) |
81 | print("Les fichiers ne sont pas dans le meme ordre. Dans une version futur, cela générera une exception. On stop le processus.") | 78 | features = np.asarray([x[1] for x in data_vector]) |
82 | exit(1) | 79 | features_T = np.transpose(features) |
83 | 80 | ||
81 | data_cluster = read_file(CLUSTERFILE_PATH) | ||
82 | data_cluster_ind = index_by_id(data_cluster) | ||
83 | clusters = [data_cluster_ind[x[0][0]][x[0][3]][0][1] for x in data_vector] | ||
84 | |||
85 | |||
86 | # TODO: compute tsne file | ||
87 | # TODO: adapt the script for the new library | ||
88 | df = pd.DataFrame(dict( | ||
89 | x=features_T[0], | ||
90 | y=features_T[1], | ||
91 | cluster=np.transpose(clusters)[0] | ||
92 | )) | ||
93 | exit(1) | ||
84 | # Get Vectors | 94 | # Get Vectors |
85 | metas, vectors = read_vector_file(VECTORFILE_PATH, toy_version = TOY_VERSION) | 95 | metas, vectors = read_vector_file(VECTORFILE_PATH, toy_version = TOY_VERSION) |
86 | vectors_T = np.transpose(vectors) | 96 | vectors_T = np.transpose(vectors) |
87 | 97 | ||
88 | # Get Clusters | 98 | # Get Clusters |
89 | metas, clusters = read_vector_file(CLUSTERFILE_PATH, toy_version = TOY_VERSION) | 99 | metas, clusters = read_vector_file(CLUSTERFILE_PATH, toy_version = TOY_VERSION) |
90 | 100 | ||
91 | #print(np.transpose(clusters)[0]) | 101 | #print(np.transpose(clusters)[0]) |
92 | #print(np.transpose(metas)[0]) | 102 | #print(np.transpose(metas)[0]) |
93 | df = pd.DataFrame(dict( | ||
94 | x=vectors_T[0], | ||
95 | y=vectors_T[1], | ||
96 | cluster=np.transpose(clusters)[0] | ||
97 | )) | ||
98 | 103 | ||
99 | groups = df.groupby('cluster') | 104 | groups = df.groupby('cluster') |