# regroup-measures.py (4.35 KB)
'''
Regroup the measures of every fold into CSV files and PDF plots.
TODO: Highlight the maximum values.
TODO: Save the values somewhere so that they can easily be retrieved.

'''

import numpy as np
import matplotlib.pyplot as plt
import argparse
import os
import json


def plot_values_clusters(values, title, xlabel, ylabel):
    """Scatter-plot a series of values on the current matplotlib axes.

    The value at position ``i`` is drawn at ``x = i + 2``, and an x tick is
    placed every 10 positions starting at 2.
    NOTE(review): the offset of 2 presumably mirrors the script's ``kmin``
    (smallest number of clusters) -- confirm before changing either one.

    Args:
        values: Sequence of numbers to plot, one per x position.
        title: Title of the axes.
        xlabel: Label of the x axis.
        ylabel: Label of the y axis.
    """
    points = np.asarray(values)
    n_points = len(points)

    # x positions 2, 3, ..., n_points + 1
    cluster_counts = np.arange(2, n_points + 2)
    plt.scatter(cluster_counts, points, s=1)

    # One tick every 10 positions: 2, 12, 22, ...
    plt.xticks(np.arange(2, n_points + 2, 10))

    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)


def save_plot(filepath):
    """Save the current matplotlib figure to ``filepath`` and close it.

    Args:
        filepath: Destination path passed to ``plt.savefig``.

    Raises:
        Whatever ``plt.savefig`` raises; the figure is closed in that case
        too.
    """
    try:
        plt.savefig(filepath)
    finally:
        # Close even when saving fails, so that the next plot does not draw
        # on top of a stale figure.
        plt.close()


def save_results(outfile, measures, titles):
    """Write columns of measures to a comma-separated text file.

    The first line holds the column titles. Each following line holds the
    i-th value of every column, converted with ``str`` and joined by commas.
    No quoting or escaping is applied.

    Args:
        outfile: Path of the file to create or overwrite.
        measures: Sequence of columns, each an indexable sequence of values.
            The number of rows is the length of the first column. An empty
            sequence of columns produces a header-only file.
        titles: Column titles (strings) written as the header line.

    Raises:
        IndexError: If a column is shorter than the first one. The output
            file is not touched in that case.
    """
    # Without any column there is no first column to measure.
    n_rows = len(measures[0]) if len(measures) else 0

    # Build every line before opening the file, so that a malformed input
    # cannot leave a half-written file behind.
    lines = [",".join(titles) + "\n"]
    lines.extend(
        ",".join(str(column[i]) for column in measures) + "\n"
        for i in range(n_rows)
    )

    with open(outfile, "w") as f:
        f.writelines(lines)


# -- PARSER
# Command-line interface: the experiment directory is mandatory; the name of
# the per-run measure file and the suffix of the output files are optional.
parser = argparse.ArgumentParser(description="")
parser.add_argument("expdir", type=str, help="Directory of experiment")
parser.add_argument("--measurefile", type=str, default="measures.json",
                    help="Measure file it searchs in folders")
parser.add_argument("--suffix", type=str, default="",
                    help="suffix of saved files")

args = parser.parse_args()
EXP_DIR = args.expdir
MEASURE_FILE = args.measurefile
SUFFIX = args.suffix

# Example value: EXP_DIR = "exp/kmeans_teacher_1/pvector-1"
# Every output file is written to the "res" sub-folder of the experiment.
RESULTS_DIR = os.path.join(EXP_DIR, "res")

# -- CONFIG
# Inclusive range of cluster counts whose measure files are read.
kmin = 2
kmax = 100


# -- CREATE FOLDER
# exist_ok avoids the race between checking for the folder and creating it.
os.makedirs(RESULTS_DIR, exist_ok=True)

# -- BEGIN REGROUPMENT

# Data subsets for which measures are stored in each measure file.
subsets = ["train", "val"]

# Disequilibrium values collected for the fold being processed.
disequilibriums = []


def init_measures():
    """Return a fresh, empty accumulator of measures.

    Returns:
        A dict with one entry per name in the module-level ``subsets`` list.
        Each entry is a dict mapping "entropy", "vscore", "homogeneity" and
        "completeness" (in that order) to its own new empty list.
    """
    metric_names = ("entropy", "vscore", "homogeneity", "completeness")
    return {
        subset: {metric: [] for metric in metric_names}
        for subset in subsets
    }


# -- REGROUP, PLOT AND SAVE THE MEASURES OF EACH FOLD
# Measure files are read from <EXP_DIR>/<kfold>/<k>/<MEASURE_FILE>, for folds
# 1 to 4 and for every k from kmin to kmax (inclusive). Each file is JSON with
# a "disequilibrium" entry and, per subset, the entries "entropy", "vscore",
# "homogeneity" and "completeness".
for kfold in range(1, 5):
    print("Regrouping on kfold: " + str(kfold))

    # Start every fold from empty accumulators so that nothing leaks over
    # from the previous fold.
    measures = init_measures()
    disequilibriums = []

    # -- REGROUP MEASURES INTO LISTS
    for k in range(kmin, kmax+1):
        measures_file = os.path.join(EXP_DIR, str(kfold), str(k), MEASURE_FILE)
        with open(measures_file, 'r') as f:
            meas_data = json.load(f)
        disequilibriums.append(meas_data["disequilibrium"])
        for subset in subsets:
            # The accumulator keys are exactly the measure names to collect.
            for measure in measures[subset]:
                measures[subset][measure].append(meas_data[subset][measure])

    # -- PLOT AND SAVE MEASURES FOR A SPECIFIC SUBSET
    for subset in subsets:
        # Shared base name so the PDF and the CSV of a subset always match.
        basename = "measures_" + str(subset) + "_" + str(kfold) + str(SUFFIX)

        # Plot all measures on one figure, one panel of a 2x2 grid each.
        plt.figure(1)
        for i, measure in enumerate(measures[subset]):
            plt.subplot(2, 2, i + 1)
            plot_values_clusters(
                measures[subset][measure],
                measure.capitalize() + " " + str(subset) + " set " + str(kfold),
                "N clusters",
                measure.capitalize())
        plt.subplots_adjust(hspace=0.5, wspace=0.3)
        save_plot(os.path.join(RESULTS_DIR, basename + ".pdf"))

        # Save all measures on a csv file, in this column order.
        csv_columns = ["entropy", "homogeneity", "completeness", "vscore"]
        save_results(
            os.path.join(RESULTS_DIR, basename + ".csv"),
            [measures[subset][name] for name in csv_columns],
            csv_columns)

    # -- PLOT AND SAVE FOR DISEQUILIBRIUM
    diseq_basename = "disequilibrium_" + str(kfold) + str(SUFFIX)

    plot_values_clusters(
        disequilibriums,
        "Disequilibrium set " + str(kfold),
        "N clusters",
        "Disequilibrium")
    save_plot(os.path.join(RESULTS_DIR, diseq_basename + ".pdf"))

    save_results(
        os.path.join(RESULTS_DIR, diseq_basename + ".csv"),
        [disequilibriums],
        ["disequilibrium"])