# plot_clusters.py 2.95 KB
# edit raw blame history



1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112


'''
Take a vector file and a cluster file and plot the vectors, grouped by
cluster, onto a 2d axis (3d plotting is not implemented yet).
'''

import os
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import argparse
import json
import pandas as pd

# Defining useful functions 

'''
Read a file in which each line holds comma-separated metas followed by the
space-separated components of a vector.
Returns two numpy arrays: (metas, vectors)

'''
def read_vector_file(filename, toy_version=False):
	"""Read a text file of metas and vectors into two numpy arrays.

	Each non-blank line is expected to look like ``m0,m1,m2,m3 v0 v1 ...``:
	the first whitespace-separated token is a comma-separated list of meta
	fields, and the remaining tokens are the components of the vector.
	Blank lines are skipped.

	Args:
		filename: path of the file to read.
		toy_version: when true, only the first 101 lines of the file
			(indices 0 to 100) are read.

	Returns:
		A tuple ``(metas, vectors)``. ``metas`` is a 2-D array of strings
		with one row of meta fields per data line, and ``vectors`` is a 2-D
		``float32`` array with one row per data line. When the file holds
		no data line, empty ``float32`` arrays of shape ``(0, 4)`` and
		``(0, 1)`` are returned.

	Raises:
		ValueError: if a vector component cannot be parsed as a float.
	"""
	# Rows are gathered in plain lists and converted once at the end:
	# np.append copies the whole array on every call, which makes a
	# per-line append quadratic in the number of lines.
	meta_rows = []
	vector_rows = []
	with open(filename, "r") as f:
		for i, line in enumerate(f):
			if toy_version and i > 100:
				break
			# split() without argument also drops the trailing newline.
			fields = line.split()
			if not fields:
				continue
			meta_rows.append(fields[0].split(","))
			vector_rows.append(fields[1:])

	if not meta_rows:
		# Same placeholder shapes as the historical implementation.
		return (np.empty((0, 4), np.float32), np.empty((0, 1), np.float32))

	metas = np.asarray(meta_rows)
	vectors = np.asarray(vector_rows, dtype=np.float32)
	return (metas, vectors)


'''
Check if the two given files have the same order.
'''
def check_files(vector_file, cluster_file):
	"""Tell whether two files list their entries in the same order.

	The files are read line by line in lockstep. On each line the first
	space-separated token is split on commas, and the first and fourth
	fields of both files are compared.

	Args:
		vector_file: path of the first file.
		cluster_file: path of the second file.

	Returns:
		False as soon as a pair of lines disagrees on one of the two
		compared fields, True otherwise. The comparison stops at the end
		of the shorter file.
	"""
	def meta_fields(raw_line):
		# First token of the line, split into its comma-separated fields.
		return raw_line.strip().split(" ")[0].split(",")

	with open(vector_file, "r") as vec_fh, open(cluster_file, "r") as clu_fh:
		field_pairs = (
			(meta_fields(vec_line), meta_fields(clu_line))
			for vec_line, clu_line in zip(vec_fh, clu_fh)
		)
		# The fourth field is only looked at when the first fields agree.
		return all(
			left[0] == right[0] and left[3] == right[3]
			for left, right in field_pairs
		)
	
		
from data import read_file, index_by_id

# Command-line interface: two positional input files and an optional
# output path for the generated figure.
parser = argparse.ArgumentParser(prog='Plotter', description='Plot a file of 2d ou 3d dimension')
parser.add_argument('clusterfile', type=str,
                    help='the path of the cluster file')
parser.add_argument('vectorfile', type=str,
                    help='the path of the vectors file')
# The short flag used to be declared as '-o-', a typo for '-o'.
parser.add_argument('-o', '--output', type=str,
                    default='plot.pdf',
                    help='the path of the ploted file')

args = parser.parse_args()

# Paths taken from the command line.
CLUSTERFILE_PATH = args.clusterfile
VECTORFILE_PATH = args.vectorfile
OUTFILE_PATH = args.output

# Load the vectors. Each entry of data_vector is indexed as
# (metas, features) below -- presumably the layout returned by
# data.read_file; confirm against that module.
data_vector = read_file(VECTORFILE_PATH)
features = np.asarray([x[1] for x in data_vector])
features_T = np.transpose(features)

# Look up the cluster of every vector through the first and fourth meta
# fields, so that clusters follow the order of data_vector.
data_cluster = read_file(CLUSTERFILE_PATH)
data_cluster_ind = index_by_id(data_cluster)
clusters = [data_cluster_ind[x[0][0]][x[0][3]][0][1] for x in data_vector]


# TODO: compute tsne file
# TODO: adapt the script for the new library
# Only the first two feature components are plotted.
df = pd.DataFrame(dict(
	x=features_T[0],
	y=features_T[1],
	cluster=np.transpose(clusters)[0]
))

# A stray exit(1) used to stop the script here with a failure status,
# followed by legacy reads that referenced the undefined TOY_VERSION and
# whose results were never used. Both were removed so that the figure is
# actually written.
groups = df.groupby('cluster')

# Plot one series of markers per cluster.
fig, ax = plt.subplots()

for cluster, group in groups:
	ax.plot(group.x, group.y, marker='o', linestyle='', ms=2, label=cluster)
ax.legend()
plt.savefig(OUTFILE_PATH)