# measure_clustering.py
'''
Compute evaluation measures from a clustering:
disequilibrium, entropy, v-measure, homogeneity and completeness.
'''
# TODO: Start with just the per-character disequilibrium.
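#
# Usage sketch (the positional arguments are this script's; the file
# paths below are hypothetical examples, not files from the repo):
#   python measure_clustering.py clusters.lst classes.lst train.lst val.lst \
#       --outfile metrics.json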
import argparse
import json

import numpy as np
from sklearn import metrics, preprocessing

from data import read_file, index_by_id
from measures import disequilibrium, entropy
# -- ARGPARSE
parser = argparse.ArgumentParser(description="Compute metrics from a clustering")
parser.add_argument("clustering", type=str,
                    help="clustering file")
parser.add_argument("classlst", type=str,
                    help="list providing the reference classes")
parser.add_argument("trainlst", type=str,
                    help="train list")
parser.add_argument("vallst", type=str,
                    help="validation list")
parser.add_argument("--outfile", type=str, default="out.out",
                    help="output file path")
args = parser.parse_args()
CLUSTERING = args.clustering
CLASS_LST = args.classlst
TRAIN_LST = args.trainlst
VAL_LST = args.vallst
OUTFILE = args.outfile
# -- READ FILES
clustering = read_file(CLUSTERING)
clustering_ind = index_by_id(clustering)
class_lst = read_file(CLASS_LST)
class_lst_ind = index_by_id(class_lst)
train_lst = read_file(TRAIN_LST)
val_lst = read_file(VAL_LST)
# -- GET CLASSES AND CLUSTERS
train_classes = np.asarray([class_lst_ind[x[0][0]][x[0][3]][0][1] for x in train_lst])
train_clusters = np.asarray([clustering_ind[x[0][0]][x[0][3]][0][1] for x in train_lst], dtype=int)
val_classes = np.asarray([class_lst_ind[x[0][0]][x[0][3]][0][1] for x in val_lst])
val_clusters = np.asarray([clustering_ind[x[0][0]][x[0][3]][0][1] for x in val_lst], dtype=int)
unique, count = np.unique(train_clusters, return_counts=True)
train_cluster_ind = dict(zip(unique, count))
unique, count = np.unique(val_clusters, return_counts=True)
val_cluster_ind = dict(zip(unique, count))
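# train_cluster_ind / val_cluster_ind map each cluster id to its occupancy
# (number of segments it holds); the metrics below do not use them, so they
# are apparently kept for manual inspection.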
#print(np.unique(train_classes, return_counts=True))
#sub = np.extract(train_clusters == 1, train_classes)
#print(np.unique(sub, return_counts=True))
def generate_count_matrix(classes1, clusters1, classes2, clusters2):
    '''
    Generate one count matrix per set.
    Rows are clusters and columns are classes.
    Each cell contains the number of occurrences of a
    character (class) in a given cluster.
    '''
    # Index classes
    classe1_unique = np.unique(classes1)
    classe2_unique = np.unique(classes2)
    all_classes = np.unique(np.concatenate((classe1_unique, classe2_unique)))
    # Label encoder for classes
    le = preprocessing.LabelEncoder()
    le.fit(all_classes)
    # Index clusters
    cluster1_unique = np.unique(clusters1)
    cluster2_unique = np.unique(clusters2)
    all_clusters = np.unique(np.concatenate((cluster1_unique, cluster2_unique)))
    # Warn when cluster ids are not contiguous, i.e. some clusters are empty
    if np.max(all_clusters) != len(all_clusters) - 1:
        print("WARNING: Some clusters are empty. Max value: "
              + str(np.max(all_clusters)) + " Nb values: " + str(len(all_clusters)))
    # Create matrices: rows (clusters) x columns (classes)
    counts_matrix1 = np.zeros((np.max(all_clusters) + 1, len(all_classes)))
    counts_matrix2 = np.zeros((np.max(all_clusters) + 1, len(all_classes)))
    for cluster in all_clusters:
        # First extract the classes present in this cluster
        cc1 = np.extract(np.asarray(clusters1) == cluster, np.asarray(classes1))
        cc2 = np.extract(np.asarray(clusters2) == cluster, np.asarray(classes2))
        cc1_unique, cc1_counts = np.unique(cc1, return_counts=True)
        cc1_ind = dict(zip(cc1_unique, cc1_counts))
        cc2_unique, cc2_counts = np.unique(cc2, return_counts=True)
        cc2_ind = dict(zip(cc2_unique, cc2_counts))
        for class_ in all_classes:
            class_id = le.transform([class_])[0]
            if class_ in cc1_ind:
                counts_matrix1[int(cluster)][int(class_id)] = cc1_ind[class_]
            if class_ in cc2_ind:
                counts_matrix2[int(cluster)][int(class_id)] = cc2_ind[class_]
    return (counts_matrix1, counts_matrix2)
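# Illustrative example (not from the original script): with
# clusters1 = [0, 0, 1] and classes1 = ["a", "b", "a"] (and the same for
# the second set), counts_matrix1 is
#   [[1., 1.],
#    [1., 0.]]
# i.e. cluster 0 contains one "a" and one "b", cluster 1 contains one "a".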
train_vscore = metrics.cluster.v_measure_score(train_classes, train_clusters)
val_vscore = metrics.cluster.v_measure_score(val_classes, val_clusters)
train_homogeneity = metrics.homogeneity_score(train_classes, train_clusters)
val_homogeneity = metrics.homogeneity_score(val_classes, val_clusters)
train_completeness = metrics.completeness_score(train_classes, train_clusters)
val_completeness = metrics.completeness_score(val_classes, val_clusters)
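# Note: v-measure is the harmonic mean of homogeneity and completeness, so
# train_vscore/val_vscore can be cross-checked against the four scores above.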
counts_matrix1, counts_matrix2 = generate_count_matrix(train_classes, train_clusters, val_classes, val_clusters)
mask, dis_human, dis_measures = disequilibrium(counts_matrix1, counts_matrix2, isGlobal=False)
(train_entropy_matrix, train_entropy) = entropy(counts_matrix1)
(val_entropy_matrix, val_entropy) = entropy(counts_matrix2)
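# measures.entropy is a local module; it is assumed to compute, for each
# cluster (row) with class proportions p_c, the Shannon entropy
# H = -sum_c p_c * log(p_c), plus an aggregate value (0 = pure cluster).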
results = {}
results["train"] = {}
results["train"]["entropy"] = train_entropy
results["train"]["vscore"] = train_vscore
results["train"]["homogeneity"] = train_homogeneity
results["train"]["completeness"] = val_completeness
results["val"] = {}
results["val"]["entropy"] = val_entropy
results["val"]["vscore"] = val_vscore
results["val"]["homogeneity"] = val_homogeneity
results["val"]["completeness"] = val_completeness
results["disequilibrium"] = dis_measures
#results = disequilibrium(counts_matrix1, counts_matrix2, isGlobal=False, mod="pow")
with open(OUTFILE, "w") as f:
json_content = json.dumps(results)
f.write(json_content)