'''
Regroup per-cluster measure files into CSV tables and PDF plots.

For every fold and every number of clusters ``k`` in ``[kmin, kmax]`` the
script reads the JSON file ``<expdir>/<fold>/<k>/<measurefile>`` and writes,
under ``<expdir>/res``:

* ``measures_<subset>_<fold><suffix>.pdf`` and ``.csv``: entropy,
  homogeneity, completeness and vscore of each subset;
* ``disequilibrium_<fold><suffix>.pdf`` and ``.csv``.

Example:
    regroup-measures.py exp/kmeans_teacher_1/pvector-1

TODO: Highlight the maximum values.
TODO: Save the values somewhere so that they can easily be retrieved.
'''

import argparse
import json
import os

import numpy as np


# -- CONFIG
kmin = 2
kmax = 100

# Folds are stored in sub-directories named 1, 2, 3 and 4.
FOLDS = range(1, 5)

subsets = ["train", "val"]

# Order of the sub-plots on the per-subset figure.  Kept explicit so the
# layout does not depend on dict iteration order.
PLOT_MEASURES = ("entropy", "vscore", "homogeneity", "completeness")

# Order of the columns in the per-subset CSV file.
CSV_MEASURES = ("entropy", "homogeneity", "completeness", "vscore")


def plot_values_clusters(values, title, xlabel, ylabel, first_k=2):
    '''
    Scatter-plot one value per number of clusters on the current axes.

    values  -- sequence of values; values[i] is drawn at x = first_k + i
    title   -- title of the axes
    xlabel  -- label of the x axis
    ylabel  -- label of the y axis
    first_k -- number of clusters associated with values[0] (default 2)

    The figure is neither saved nor closed; see save_plot().
    '''
    # Imported here so the non-plotting helpers of this file can be used
    # without matplotlib being importable.
    import matplotlib.pyplot as plt

    y = np.asarray(values)
    x = np.arange(len(y)) + first_k
    # One tick every 10 values keeps the axis readable.
    x_ticks = np.arange(0, len(y), 10) + first_k
    plt.scatter(x, y, s=1)
    plt.xticks(x_ticks)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)


def save_plot(filepath):
    '''Save the current matplotlib figure to filepath, then close it.'''
    import matplotlib.pyplot as plt

    plt.savefig(filepath)
    plt.close()


def save_results(outfile, measures, titles):
    '''
    Write columns of values to a comma-separated file.

    outfile  -- path of the file to write (overwritten if it exists)
    measures -- list of columns; every column must have the same length
    titles   -- column names, written as the first line

    An empty list of columns produces a file holding only the header line.

    Raises ValueError if the columns do not all have the same length; the
    check happens before the file is opened so no partial file is written.
    '''
    n_rows = len(measures[0]) if measures else 0
    for column in measures:
        if len(column) != n_rows:
            raise ValueError(
                "all columns must have the same length "
                "(expected {}, got {})".format(n_rows, len(column)))

    lines = [",".join(titles) + "\n"]
    for row in zip(*measures):
        lines.append(",".join(str(value) for value in row) + "\n")

    with open(outfile, "w") as f:
        f.writelines(lines)


def init_measures():
    '''Return a fresh {subset: {measure name: []}} accumulator.'''
    return {
        subset: {name: [] for name in PLOT_MEASURES}
        for subset in subsets
    }


def _load_fold(exp_dir, kfold, measure_file):
    '''
    Read the measure files of one fold for every k in [kmin, kmax].

    Returns (measures, disequilibriums): the accumulator described in
    init_measures() and the list of "disequilibrium" values, both ordered by
    increasing k.
    '''
    measures = init_measures()
    disequilibriums = []
    for k in range(kmin, kmax + 1):
        measures_file = os.path.join(exp_dir, str(kfold), str(k), measure_file)
        with open(measures_file, 'r') as f:
            meas_data = json.load(f)
        disequilibriums.append(meas_data["disequilibrium"])
        for subset in subsets:
            for name in PLOT_MEASURES:
                measures[subset][name].append(meas_data[subset][name])
    return measures, disequilibriums


def _export_subset(results_dir, subset, kfold, suffix, subset_measures):
    '''Write the 2x2 figure and the CSV table of one subset of one fold.'''
    import matplotlib.pyplot as plt

    basename = "measures_{}_{}{}".format(subset, kfold, suffix)

    # -- PLOT ALL MEASURES ON ONE FIGURE
    plt.figure(1)
    for position, name in enumerate(PLOT_MEASURES, start=1):
        plt.subplot(2, 2, position)
        plot_values_clusters(
            subset_measures[name],
            "{} {} set {}".format(name.capitalize(), subset, kfold),
            "N clusters",
            name.capitalize(),
            first_k=kmin)
    plt.subplots_adjust(hspace=0.5, wspace=0.3)
    save_plot(os.path.join(results_dir, basename + ".pdf"))

    # -- SAVE ALL MEASURES ON A CSV FILE
    save_results(
        os.path.join(results_dir, basename + ".csv"),
        [subset_measures[name] for name in CSV_MEASURES],
        list(CSV_MEASURES))


def _export_disequilibrium(results_dir, kfold, suffix, disequilibriums):
    '''Write the figure and the CSV table of the disequilibrium of one fold.'''
    basename = "disequilibrium_{}{}".format(kfold, suffix)

    plot_values_clusters(
        disequilibriums,
        "Disequilibrium set " + str(kfold),
        "N clusters",
        "Disequilibrium",
        first_k=kmin)
    save_plot(os.path.join(results_dir, basename + ".pdf"))

    save_results(
        os.path.join(results_dir, basename + ".csv"),
        [disequilibriums],
        ["disequilibrium"])


def _parse_args(argv=None):
    '''Parse the command line (sys.argv[1:] when argv is None).'''
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("expdir", type=str, help="Directory of experiment")
    parser.add_argument("--measurefile", type=str, default="measures.json",
                        help="Measure file it searchs in folders")
    parser.add_argument("--suffix", type=str, default="",
                        help="suffix of saved files")
    return parser.parse_args(argv)


def main(argv=None):
    '''Regroup the measures of every fold of the experiment directory.'''
    args = _parse_args(argv)
    exp_dir = args.expdir
    results_dir = os.path.join(exp_dir, "res")

    # -- CREATE FOLDER
    if not os.path.exists(results_dir):
        os.makedirs(results_dir)

    # -- BEGIN REGROUPMENT
    for kfold in FOLDS:
        print("Regrouping on kfold: " + str(kfold))
        # Fresh accumulators per fold: nothing leaks from one fold to the next.
        measures, disequilibriums = _load_fold(
            exp_dir, kfold, args.measurefile)

        for subset in subsets:
            _export_subset(
                results_dir, subset, kfold, args.suffix, measures[subset])

        _export_disequilibrium(
            results_dir, kfold, args.suffix, disequilibriums)


if __name__ == "__main__":
    main()