From 933b2505a171366ecf2ae150b3a8ba49413d3695 Mon Sep 17 00:00:00 2001 From: Mathias Quillot Date: Mon, 22 Jul 2019 12:08:14 +0200 Subject: [PATCH] Add entropy --- bin/measure_clustering.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/bin/measure_clustering.py b/bin/measure_clustering.py index 82de23c..4bf36ac 100644 --- a/bin/measure_clustering.py +++ b/bin/measure_clustering.py @@ -47,6 +47,7 @@ val_lst = read_file(VAL_LST) train_classes = np.asarray([class_lst_ind[x[0][0]][x[0][3]][0][1] for x in train_lst]) train_clusters = np.asarray([clustering_ind[x[0][0]][x[0][3]][0][1] for x in train_lst], dtype=np.int) + val_classes = np.asarray([class_lst_ind[x[0][0]][x[0][3]][0][1] for x in val_lst]) val_clusters = np.asarray([clustering_ind[x[0][0]][x[0][3]][0][1] for x in val_lst], dtype=np.int) @@ -78,7 +79,7 @@ def generate_count_matrix(classes1, clusters1, classes2, clusters2): classe1_unique = np.unique(classes1) classe2_unique = np.unique(classes2) all_classes = np.unique(np.concatenate((classe1_unique, classe2_unique))) - + # Label Encoder for classes le = preprocessing.LabelEncoder() le.fit(all_classes) @@ -86,12 +87,15 @@ def generate_count_matrix(classes1, clusters1, classes2, clusters2): # Index cluster1_unique = np.unique(clusters1) cluster2_unique = np.unique(clusters2) - all_clusters = np.unique(np.concatenate((cluster1_unique, cluster2_unique))) + # Warning + if np.max(all_clusters) != len(cluster1_unique)-1: + print("WARNING: Some clusters are empty. Value max : " + str(np.max(all_clusters)) + " Nb values : " + str(len(cluster1_unique))) + # Create matrix lin(clust) col(class) - counts_matrix1 = np.zeros((len(all_clusters), len(all_classes))) - counts_matrix2 = np.zeros((len(all_clusters), len(all_classes))) + counts_matrix1 = np.zeros((np.max(all_clusters) + 1, len(all_classes))) + counts_matrix2 = np.zeros((np.max(all_clusters) + 1, len(all_classes))) for cluster in all_clusters: @@ -123,18 +127,22 @@ train_completeness = metrics.completeness_score(train_classes, train_clusters) val_completeness = metrics.completeness_score(val_classes, val_clusters) counts_matrix1, counts_matrix2 = generate_count_matrix(train_classes, train_clusters, val_classes, val_clusters) + mask, dis_human, dis_measures = disequilibrium(counts_matrix1, counts_matrix2, isGlobal=False) + (train_entropy_matrix, train_entropy) = entropy(counts_matrix1) (val_entropy_matrix, val_entropy) = entropy(counts_matrix2) results = {} results["train"] = {} +results["train"]["entropy"] = train_entropy results["train"]["vscore"] = train_vscore results["train"]["homogeneity"] = train_homogeneity results["train"]["completeness"] = val_completeness results["val"] = {} +results["val"]["entropy"] = val_entropy results["val"]["vscore"] = val_vscore results["val"]["homogeneity"] = val_homogeneity results["val"]["completeness"] = val_completeness -- 1.8.2.3