Quillot Mathias / Clustering

Blame view

bin/plot_clusters.py 2.92 KB
  '''
  Read a vector file and a cluster file and plot the vectors on a 2d axis, one plotted series per cluster. Only the first two components of each vector are plotted.
  '''
  
  import os
  import numpy as np
  from sklearn.cluster import KMeans
  import matplotlib.pyplot as plt
  import argparse
  import json
  import pandas as pd
  
  # Defining useful functions 
  
  '''
  Read a file whose lines hold comma-separated metas followed by a space-separated vector.
  Returns two numpy arrays: (metas, vectors)
  
  '''
  def read_vector_file(filename, toy_version=False):
  	"""Read a metas/vectors file into two numpy arrays.

  	Each non-blank line is expected to look like ``m0,m1,m2,m3 v0 v1 ...``:
  	a comma-separated meta field followed by the vector components, the
  	fields being separated by whitespace. Blank lines are skipped.

  	Args:
  		filename: path of the file to read.
  		toy_version: when true, stop after the first 101 lines of the file.

  	Returns:
  		A tuple ``(metas, vectors)``. ``metas`` is a 2-D array of strings
  		(one row per data line) and ``vectors`` a 2-D ``float32`` array.
  		When the file holds no data line, both arrays are empty with
  		shapes ``(0, 4)`` and ``(0, 1)``.

  	Raises:
  		ValueError: if a vector component cannot be parsed as a number or
  			if the vectors do not all have the same length.
  	"""
  	meta_rows = []
  	vector_rows = []
  	with open(filename, "r") as f:
  		for i, line in enumerate(f):
  			if toy_version and i > 100:
  				break
  			# split() without argument drops the trailing newline and
  			# tolerates repeated or trailing blanks.
  			fields = line.split()
  			if not fields:
  				continue
  			meta_rows.append(fields[0].split(","))
  			vector_rows.append(fields[1:])

  	if not meta_rows:
  		# Same empty shapes as the arrays the function used to start from.
  		return (np.empty((0, 4), np.float32), np.empty((0, 1), np.float32))

  	# Build each array once: calling np.append per line copied the whole
  	# array at every iteration.
  	return (np.asarray(meta_rows), np.asarray(vector_rows, dtype=np.float32))
  
  
  '''
  Check if the two given files have the same order.
  '''
  def check_files(vector_file, cluster_file):
  	"""Tell whether two files list their entries in the same order.

  	Lines are compared pairwise. For each line, the key is made of the
  	first and the fourth comma-separated values of the leading meta field
  	(the text before the first space).

  	Args:
  		vector_file: path of the vectors file.
  		cluster_file: path of the clusters file.

  	Returns:
  		False as soon as a pair of lines has different keys, True
  		otherwise. Only the lines present in both files are compared:
  		zip() stops at the end of the shorter file, so extra lines in the
  		longer one are ignored.
  	"""
  	def order_key(line):
  		fields = line.strip().split(" ")[0].split(",")
  		# A blank or truncated line has no fourth value: use None instead
  		# of raising IndexError so that the line can still be compared.
  		return (fields[0], fields[3] if len(fields) > 3 else None)

  	with open(vector_file, "r") as f1, open(cluster_file, "r") as f2:
  		return all(order_key(line1) == order_key(line2) for line1, line2 in zip(f1, f2))
  	
  		
  		
  
  
  # Command-line interface: two positional input files plus optional flags.
  parser = argparse.ArgumentParser(prog='Plotter', description='Plot a file of 2d ou 3d dimension')
  parser.add_argument('clusterfile', type=str,
                      help='the path of the cluster file')
  parser.add_argument('vectorfile', type=str,
                      help='the path of the vectors file')
  # The short flag used to be spelled '-o-' (a typo for '-o'); the old
  # spelling is still accepted so that existing command lines keep working.
  parser.add_argument('-o', '-o-', '--output', type=str,
                      default='plot.pdf',
                      help='the path of the ploted file')
  parser.add_argument('-t', '--toy', action='store_true',
                      help='test the script on a toy example. Do not test all the file content')

  # Parses sys.argv; argparse exits with a usage message on invalid input.
  args = parser.parse_args()
  
  # Settings taken from the command line.
  CLUSTERFILE_PATH = args.clusterfile
  VECTORFILE_PATH = args.vectorfile
  OUTFILE_PATH = args.output
  TOY_VERSION = args.toy

  # Stop early when the two files do not list their entries in the same order.
  if not check_files(VECTORFILE_PATH, CLUSTERFILE_PATH):
  	print("Les fichiers ne sont pas dans le meme ordre. Dans une version futur, cela générera une exception. On stop le processus.")
  	# exit() is a helper installed by the site module for interactive use;
  	# raising SystemExit gives the same exit status without relying on it.
  	raise SystemExit(1)
  
  # Load the vectors and keep a column-wise view of them for plotting.
  metas, vectors = read_vector_file(VECTORFILE_PATH, toy_version=TOY_VERSION)
  vectors_T = np.transpose(vectors)
  if len(vectors_T) < 2:
  	# Without this check an empty or one-dimensional vector file ends in a
  	# bare IndexError when the second component is read below.
  	raise SystemExit("Cannot plot: the vector file must hold vectors with at least two components.")

  # Load the clusters; the first value after the metas is used as cluster id.
  metas, clusters = read_vector_file(CLUSTERFILE_PATH, toy_version=TOY_VERSION)

  # One row per vector: its first two components and its cluster.
  df = pd.DataFrame(dict(
  	x=vectors_T[0],
  	y=vectors_T[1],
  	cluster=np.transpose(clusters)[0],
  ))

  groups = df.groupby('cluster')

  # Draw one series of unconnected markers per cluster, labelled with the
  # cluster value, then write the figure to the output path.
  fig, ax = plt.subplots()

  for cluster, group in groups:
  	ax.plot(group.x, group.y, marker='o', linestyle='', ms=2, label=cluster)
  ax.legend()
  fig.savefig(OUTFILE_PATH)