# regroup-measures.py 3.21 KB
'''
Regroup results into one file and a plot.
'''

import numpy as np
import matplotlib.pyplot as plt
import argparse
import os
import json


def plot_values_clusters(filepath, values, title, xlabel, ylabel, first_k=2):
	'''
	Save a scatter plot with one point per number of clusters.

	The i-th entry of ``values`` is drawn at x = first_k + i, and an x tick
	is placed on every fifth point starting from the first one.

	The plot is drawn on its own figure, which is always closed afterwards
	(even when saving fails), so a figure already open in the caller is
	left untouched and no figure is leaked.

	Args:
		filepath: Destination handed to ``Figure.savefig``.
		values: Sequence of values to plot, one per number of clusters.
		title: Title of the plot.
		xlabel: Label of the x axis.
		ylabel: Label of the y axis.
		first_k: Number of clusters matching ``values[0]`` (default 2,
			the offset that was previously hard-coded).
	'''
	y = np.asarray(values)
	x = np.arange(len(y)) + first_k
	# Every fifth x position: first_k, first_k + 5, first_k + 10, ...
	x_ticks = x[::5]

	fig, ax = plt.subplots()
	try:
		ax.scatter(x, y)
		ax.set_xticks(x_ticks)
		ax.set_title(title)
		ax.set_xlabel(xlabel)
		ax.set_ylabel(ylabel)
		fig.savefig(filepath)
	finally:
		# Release the figure whether or not saving succeeded.
		plt.close(fig)

# -- COMMAND LINE
# One positional argument (the experiment directory) and two options: the
# name of the measure file looked up in each run folder, and a suffix that
# is appended to the name of every saved plot.
parser = argparse.ArgumentParser(description="")
parser.add_argument(
	"expdir",
	type=str,
	help="Directory of experiment",
)
parser.add_argument(
	"--measurefile",
	type=str,
	default="measures.json",
	help="Measure file it searchs in folders",
)
parser.add_argument(
	"--suffix",
	type=str,
	default="",
	help="suffix of saved files",
)

args = parser.parse_args()
EXP_DIR, MEASURE_FILE, SUFFIX = args.expdir, args.measurefile, args.suffix

# Example of an experiment directory: exp/kmeans_teacher_1/pvector-1
# Plots are saved in the "res" sub-directory of the experiment directory.
RESULTS_DIR = os.path.join(EXP_DIR, "res")

# -- CONFIG
# Smallest and largest number of clusters whose measures are read
# (both bounds are included by the loop below).
kmin = 2
kmax = 100


# -- CREATE FOLDER
# exist_ok=True creates the folder only when it is missing, without the
# window between a separate existence check and the creation in which
# another process could create it first and make makedirs fail.
os.makedirs(RESULTS_DIR, exist_ok=True)

# -- BEGIN REGROUPING

# Data subsets for which measures are collected.
subsets = ["train", "val"]

# Disequilibrium values collected for the fold being processed.
disequilibriums = []

def init_measures():
	'''
	Build an empty container for the measures.

	Returns:
		A dict with one entry per name in ``subsets``; each entry is a dict
		mapping "entropy", "vscore", "homogeneity" and "completeness" to
		its own new empty list.
	'''
	metric_names = ("entropy", "vscore", "homogeneity", "completeness")
	return {
		subset: {name: [] for name in metric_names}
		for subset in subsets
	}

# Containers for the fold being processed; they are emptied again at the
# end of every fold.
measures = init_measures()

# (key in the measure file, label used in plot titles and y axis)
METRIC_LABELS = (
	("entropy", "Entropy"),
	("vscore", "Vscore"),
	("homogeneity", "Homogeneity"),
	("completeness", "Completeness"),
)

# Folds are numbered 1 to 4.
for kfold in range(1, 5):
	print(kfold)

	# Read <EXP_DIR>/<kfold>/<k>/<MEASURE_FILE> for every k, in increasing
	# order, so that position i of each list corresponds to k = kmin + i.
	for k in range(kmin, kmax+1):
		json_path = os.path.join(EXP_DIR, str(kfold), str(k), MEASURE_FILE)
		with open(json_path, 'r') as handle:
			content = json.load(handle)
		disequilibriums.append(content["disequilibrium"])
		for subset in subsets:
			for metric, _ in METRIC_LABELS:
				measures[subset][metric].append(content[subset][metric])

	# One plot per subset and measure for this fold.
	for subset in subsets:
		for metric, label in METRIC_LABELS:
			plot_values_clusters(
				os.path.join(RESULTS_DIR, "{}_{}_{}{}.pdf".format(metric, subset, kfold, SUFFIX)),
				measures[subset][metric],
				"{} {} set {}".format(label, subset, kfold),
				"N clusters",
				label)

	# The disequilibrium is stored once per file, not once per subset.
	plot_values_clusters(
		os.path.join(RESULTS_DIR, "disequilibrium_{}{}.pdf".format(kfold, SUFFIX)),
		disequilibriums,
		"Disequilibrium set {}".format(kfold),
		"N clusters",
		"Disequilibrium")

	# Start the next fold with empty containers.
	measures = init_measures()
	disequilibriums = []