Quillot Mathias / Clustering

1

'''

1

'''

2

Take a file and plot its data onto a 2d or 3d axis depending on the data.

2

Take a file and plot its data onto a 2d or 3d axis depending on the data.

3

'''

3

'''

4

5

import os

5

import os

6

import numpy as np

6

import numpy as np

7

from sklearn.cluster import KMeans

7

from sklearn.cluster import KMeans

8

import matplotlib.pyplot as plt

8

import matplotlib.pyplot as plt

9

import argparse

9

import argparse

10

import json

10

import json

11

import pandas as pd

11

import pandas as pd

12

13

# Defining useful functions

13

# Defining useful functions

14

15

'''

15

'''

16

Read the file whose content is metas and vectors.

16

Read the file whose content is metas and vectors.

17

Returns two numpy arrays: (metas, vectors)

17

Returns two numpy arrays: (metas, vectors)

18

19

'''

19

'''

20

def read_vector_file(filename, toy_version=False):

20

def read_vector_file(filename, toy_version=False):

21

vectors = np.empty((0, 1), np.float32)

21

vectors = np.empty((0, 1), np.float32)

22

metas = np.empty((0, 4), np.float32)

22

metas = np.empty((0, 4), np.float32)

23

with open(filename, "r") as f:

23

with open(filename, "r") as f:

24

for i, line in enumerate(f):

24

for i, line in enumerate(f):

25

if toy_version == True and i > 100:

25

if toy_version == True and i > 100:

26

break

26

break

27

spl_line = line.split(" ")

27

spl_line = line.split(" ")

28

if(len(vectors) == 0):

28

if(len(vectors) == 0):

29

vectors = np.empty((0, len(spl_line[1:])), np.float32)

29

vectors = np.empty((0, len(spl_line[1:])), np.float32)

30

metas = np.append(

30

metas = np.append(

31

metas,

31

metas,

32

np.asarray([spl_line[0].split(",")]),

32

np.asarray([spl_line[0].split(",")]),

33

axis=0)

33

axis=0)

34

35

vectors = np.append(

35

vectors = np.append(

36

vectors,

36

vectors,

37

np.asarray([spl_line[1:]], dtype=np.float32),

37

np.asarray([spl_line[1:]], dtype=np.float32),

38

axis=0)

38

axis=0)

39

return (metas, vectors)

39

return (metas, vectors)

40

41

42

'''

42

'''

43

Check if the two given files have the same order.

43

Check if the two given files have the same order.

44

'''

44

'''

45

def check_files(vector_file, cluster_file):

45

def check_files(vector_file, cluster_file):

46

with open(vector_file, "r") as f1, open(cluster_file, "r") as f2:

46

with open(vector_file, "r") as f1, open(cluster_file, "r") as f2:

47

for line1, line2 in zip(f1, f2):

47

for line1, line2 in zip(f1, f2):

48

line1_str = line1.strip()

48

line1_str = line1.strip()

49

line2_str = line2.strip()

49

line2_str = line2.strip()

50

metas1 = line1_str.split(" ")[0].split(",")

50

metas1 = line1_str.split(" ")[0].split(",")

51

metas2 = line2_str.split(" ")[0].split(",")

51

metas2 = line2_str.split(" ")[0].split(",")

52

if(not metas1[0] == metas2[0] or not metas1[3] == metas2[3]):

52

if(not metas1[0] == metas2[0] or not metas1[3] == metas2[3]):

53

return False

53

return False

54

return True

54

return True

55

56

57

58

from data import read_file, index_by_id

58

59

60

# Defining argparse

60

# Defining argparse

61

parser = argparse.ArgumentParser(prog='Plotter', description='Plot a file of 2d ou 3d dimension')

61

parser = argparse.ArgumentParser(prog='Plotter', description='Plot a file of 2d ou 3d dimension')

62

parser.add_argument('clusterfile', type=str,

62

parser.add_argument('clusterfile', type=str,

63

help='the path of the cluster file')

63

help='the path of the cluster file')

64

parser.add_argument('vectorfile', type=str,

64

parser.add_argument('vectorfile', type=str,

65

help='the path of the vectors file')

65

help='the path of the vectors file')

66

parser.add_argument('-o-', '--output', type=str,

66

parser.add_argument('-o-', '--output', type=str,

67

default='plot.pdf',

67

default='plot.pdf',

68

help='the path of the ploted file')

68

help='the path of the ploted file')

69

parser.add_argument('-t', '--toy', action='store_true',

70

help='test the script on a toy example. Do not test all the file content')

71

69

72

args = parser.parse_args()

70

args = parser.parse_args()

73

71

74

# Editing global variable

72

# Editing global variable

75

CLUSTERFILE_PATH=args.clusterfile

73

CLUSTERFILE_PATH=args.clusterfile

76

VECTORFILE_PATH=args.vectorfile

74

VECTORFILE_PATH=args.vectorfile

77

OUTFILE_PATH = args.output

75

OUTFILE_PATH = args.output

78

TOY_VERSION = args.toy

79

76

80

if check_files(VECTORFILE_PATH, CLUSTERFILE_PATH) == False:

77

data_vector = read_file(VECTORFILE_PATH)

81

print("Les fichiers ne sont pas dans le meme ordre. Dans une version futur, cela générera une exception. On stop le processus.")

78

features = np.asarray([x[1] for x in data_vector])

82

exit(1)

79

features_T = np.transpose(features)

83

80

81

data_cluster = read_file(CLUSTERFILE_PATH)

82

data_cluster_ind = index_by_id(data_cluster)

83

clusters = [data_cluster_ind[x[0][0]][x[0][3]][0][1] for x in data_vector]

84

85

86

# TODO: compute tsne file

87

# TODO: adapt the script for the new library

88

df = pd.DataFrame(dict(

89

x=features_T[0],

90

y=features_T[1],

91

cluster=np.transpose(clusters)[0]

92

))

93

exit(1)

84

# Get Vectors

94

# Get Vectors

85

metas, vectors = read_vector_file(VECTORFILE_PATH, toy_version = TOY_VERSION)

95

metas, vectors = read_vector_file(VECTORFILE_PATH, toy_version = TOY_VERSION)

86

vectors_T = np.transpose(vectors)

96

vectors_T = np.transpose(vectors)

87

97

88

# Get Clusters

98

# Get Clusters

89

metas, clusters = read_vector_file(CLUSTERFILE_PATH, toy_version = TOY_VERSION)

99

metas, clusters = read_vector_file(CLUSTERFILE_PATH, toy_version = TOY_VERSION)

90

100

91

#print(np.transpose(clusters)[0])

101

#print(np.transpose(clusters)[0])

92

#print(np.transpose(metas)[0])

102

#print(np.transpose(metas)[0])

93

df = pd.DataFrame(dict(

94

x=vectors_T[0],

95

y=vectors_T[1],

96

cluster=np.transpose(clusters)[0]

97

))

98

103

99

groups = df.groupby('cluster')

104

groups = df.groupby('cluster')

GITLAB

Quillot Mathias / Clustering

Adapt the script to my library. It still needs more work.

 '''
 Take a file and plot its data onto a 2d or 3d axis depending on the data.
 '''
 import os
 import numpy as np
 from sklearn.cluster import KMeans
 import matplotlib.pyplot as plt
 import argparse
 import json
 import pandas as pd
 # Defining useful functions
 def read_vector_file(filename, toy_version=False):
 	'''
 	Read the file whose content is metas and vectors.

 	Each non-blank line is expected to look like
 	"<m1>,<m2>,<m3>,<m4> <v1> <v2> ...": the first whitespace-separated
 	token holds four comma-separated meta fields, the remaining tokens are
 	the components of the vector.

 	filename    -- path of the file to read
 	toy_version -- when true, stop reading after the first 101 lines
 	               (line indices 0 to 100)

 	Returns two numpy arrays: (metas, vectors). metas holds one row of four
 	strings per data line, vectors one float32 row per data line. When the
 	file has no data line, empty float32 arrays of shape (0, 4) and (0, 1)
 	are returned.

 	Raises ValueError when a line does not hold exactly four meta fields,
 	when the vectors do not all have the same number of components, or when
 	a component cannot be converted to a float.
 	'''
 	meta_rows = []
 	vector_rows = []
 	with open(filename, "r") as f:
 		for i, line in enumerate(f):
 			if toy_version and i > 100:
 				break
 			# split() with no argument drops the trailing newline and ignores
 			# repeated or trailing blanks, which split(" ") turned into bogus
 			# empty fields.
 			spl_line = line.split()
 			if not spl_line:
 				# Blank line (typically at the end of the file): no data.
 				continue
 			meta_fields = spl_line[0].split(",")
 			if len(meta_fields) != 4:
 				raise ValueError(
 					"line %d of %s: expected 4 meta fields, got %d"
 					% (i + 1, filename, len(meta_fields)))
 			meta_rows.append(meta_fields)
 			vector_rows.append(spl_line[1:])
 	if not meta_rows:
 		return (np.empty((0, 4), np.float32), np.empty((0, 1), np.float32))
 	# Build each array once: calling np.append for every line copies the
 	# whole array each time, which is quadratic in the number of lines.
 	metas = np.asarray(meta_rows)
 	vectors = np.asarray(vector_rows, dtype=np.float32)
 	return (metas, vectors)
 def check_files(vector_file, cluster_file):
 	'''
 	Check if the two given files have the same order.

 	The files are read in lockstep. For each pair of lines, the first
 	whitespace-separated token is split on "," and the first and fourth
 	meta fields of both lines are compared.

 	vector_file  -- path of the vectors file
 	cluster_file -- path of the cluster file

 	Returns False at the first pair of lines whose fields differ, True
 	otherwise. Only the lines both files have in common are compared: when
 	one file is longer, its extra lines are ignored.
 	'''
 	def order_key(line):
 		# A blank line, or a line with fewer than four meta fields, has no
 		# fourth field; use None instead of failing with an IndexError.
 		tokens = line.split()
 		fields = tokens[0].split(",") if tokens else [""]
 		fourth = fields[3] if len(fields) > 3 else None
 		return (fields[0], fourth)

 	with open(vector_file, "r") as f1, open(cluster_file, "r") as f2:
 		for line1, line2 in zip(f1, f2):
 			if order_key(line1) != order_key(line2):
 				return False
 		return True
+from data import read_file, index_by_id
 # Defining argparse
 parser = argparse.ArgumentParser(prog='Plotter', description='Plot a file of 2d ou 3d dimension')
 parser.add_argument('clusterfile', type=str,
                     help='the path of the cluster file')
 parser.add_argument('vectorfile', type=str,
                     help='the path of the vectors file')
 parser.add_argument('-o-', '--output', type=str,
                     default='plot.pdf',
                     help='the path of the ploted file')
-parser.add_argument('-t', '--toy', action='store_true',
-                    help='test the script on a toy example. Do not test all the file content')
 args = parser.parse_args()
 # Editing global variable
 CLUSTERFILE_PATH=args.clusterfile
 VECTORFILE_PATH=args.vectorfile
 OUTFILE_PATH = args.output
-TOY_VERSION = args.toy
-if check_files(VECTORFILE_PATH, CLUSTERFILE_PATH) == False:
+data_vector = read_file(VECTORFILE_PATH)
-	print("Les fichiers ne sont pas dans le meme ordre. Dans une version futur, cela générera une exception. On stop le processus.")
+features = np.asarray([x[1] for x in data_vector])
-	exit(1)
+features_T = np.transpose(features)
+data_cluster = read_file(CLUSTERFILE_PATH)
+data_cluster_ind = index_by_id(data_cluster)
+clusters = [data_cluster_ind[x[0][0]][x[0][3]][0][1] for x in data_vector]
+# TODO: compute tsne file
+# TODO: adapt the script for the new library
+df = pd.DataFrame(dict(
+		x=features_T[0],
+		y=features_T[1],
+		cluster=np.transpose(clusters)[0]
+	))
+exit(1)
 # Get Vectors
 metas, vectors = read_vector_file(VECTORFILE_PATH, toy_version = TOY_VERSION)
 vectors_T = np.transpose(vectors)
 # Get Clusters
 metas, clusters = read_vector_file(CLUSTERFILE_PATH, toy_version = TOY_VERSION)
 #print(np.transpose(clusters)[0])
 #print(np.transpose(metas)[0])
-df = pd.DataFrame(dict(
-		x=vectors_T[0],
-		y=vectors_T[1],
-		cluster=np.transpose(clusters)[0]
-	))
 groups = df.groupby('cluster')