regroup-measures.py
3.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
'''
Gather the per-fold clustering measures of an experiment and plot each one
against the number of clusters.
'''
import numpy as np
import matplotlib.pyplot as plt
import argparse
import os
import json
def plot_values_clusters(filepath, values, title, xlabel, ylabel, kmin=2, tick_step=5):
    """Scatter-plot a series of values against the number of clusters and save it.

    The i-th entry of ``values`` is drawn at x = ``kmin + i``, i.e. the values
    are taken to correspond to consecutive cluster counts starting at ``kmin``.

    Args:
        filepath: Destination handed to ``savefig``.
        values: Sequence of values, one per consecutive cluster count.
        title: Figure title.
        xlabel: Label of the x axis.
        ylabel: Label of the y axis.
        kmin: Cluster count associated with ``values[0]``. Defaults to 2,
            the offset that used to be hard-coded.
        tick_step: Spacing between x ticks. Defaults to 5, the spacing that
            used to be hard-coded.
    """
    values = np.asarray(values)
    x = np.arange(len(values)) + kmin
    x_ticks = np.arange(0, len(values), tick_step) + kmin

    # Draw on an explicit figure instead of pyplot's implicit "current" one,
    # and always close it so a failed save cannot leak into the next plot.
    fig, ax = plt.subplots()
    try:
        ax.scatter(x, values)
        ax.set_xticks(x_ticks)
        ax.set_title(title)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        fig.savefig(filepath)
    finally:
        plt.close(fig)
# -- PARSER
parser = argparse.ArgumentParser(description="")
parser.add_argument("expdir", type=str, help="Directory of experiment")
parser.add_argument("--measurefile", type=str, default="measures.json", help="Measure file it searchs in folders")
parser.add_argument("--suffix", type=str, default="", help="suffix of saved files")
args = parser.parse_args()

EXP_DIR = args.expdir
MEASURE_FILE = args.measurefile
SUFFIX = args.suffix
# Example value: EXP_DIR = "exp/kmeans_teacher_1/pvector-1"

# Every plot is written below <expdir>/res.
RESULTS_DIR = os.path.join(EXP_DIR, "res")

# -- CONFIG
# Inclusive range of cluster counts whose measure files are read.
kmin = 2
kmax = 100

# -- CREATE FOLDER
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(RESULTS_DIR, exist_ok=True)

# -- BEGIN REGROUPMENT
subsets = ["train", "val"]
disequilibriums = []
def init_measures():
    """Build a fresh, empty accumulator for every subset.

    Returns:
        dict: maps each name in the module-level ``subsets`` list to a dict
        holding an independent empty list under the keys "entropy",
        "vscore", "homogeneity" and "completeness".
    """
    metric_names = ("entropy", "vscore", "homogeneity", "completeness")
    return {
        subset_name: {metric: [] for metric in metric_names}
        for subset_name in subsets
    }
# Measures collected for every subset, in the order their plots are produced.
metric_names = ("entropy", "vscore", "homogeneity", "completeness")

for kfold in range(1, 5):
    print(kfold)

    # Start each fold from empty accumulators so plots never mix folds.
    measures = init_measures()
    disequilibriums = []

    # Collect the measures of this fold for every number of clusters.
    for k in range(kmin, kmax + 1):
        measures_file = os.path.join(EXP_DIR, str(kfold), str(k), MEASURE_FILE)
        with open(measures_file, "r", encoding="utf-8") as f:
            meas_data = json.load(f)

        disequilibriums.append(meas_data["disequilibrium"])
        for subset in subsets:
            for metric in metric_names:
                measures[subset][metric].append(meas_data[subset][metric])

    # One plot per (subset, metric) pair for this fold.
    for subset in subsets:
        for metric in metric_names:
            label = metric.capitalize()
            plot_values_clusters(
                os.path.join(RESULTS_DIR, metric + "_" + str(subset) + "_" + str(kfold) + str(SUFFIX) + ".pdf"),
                measures[subset][metric],
                label + " " + str(subset) + " set " + str(kfold),
                "N clusters",
                label)

    # The disequilibrium is stored once per measure file, not per subset.
    plot_values_clusters(
        os.path.join(RESULTS_DIR, "disequilibrium_" + str(kfold) + str(SUFFIX) + ".pdf"),
        disequilibriums,
        "Disequilibrium set " + str(kfold),
        "N clusters",
        "Disequilibrium")